Commit c93faf44 authored by: S smallv0221

Merge branch 'develop' of https://github.com/PaddlePaddle/models into yxp1222

......@@ -15,7 +15,7 @@ import paddle
import paddle.fluid as fluid
import utils.utility as utility
AMP_MODEL_LIST = ["ResNet50", "SE_ResNet50_vd"]
AMP_MODEL_LIST = ["ResNet50", "SE_ResNet50_vd", "ResNet200_vd"]
def _calc_label_smoothing_loss(softmax_out, label, class_dim, epsilon):
......
......@@ -23,7 +23,8 @@ import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
__all__ = [
"ResNet", "ResNet18_vd", "ResNet34_vd", "ResNet50_vd", "ResNet101_vd", "ResNet152_vd", "ResNet200_vd"
"ResNet", "ResNet18_vd", "ResNet34_vd", "ResNet50_vd", "ResNet101_vd",
"ResNet152_vd", "ResNet200_vd"
]
......@@ -32,7 +33,7 @@ class ResNet():
self.layers = layers
self.is_3x3 = is_3x3
def net(self, input, class_dim=1000):
def net(self, input, class_dim=1000, data_format="NCHW"):
is_3x3 = self.is_3x3
layers = self.layers
supported_layers = [18, 34, 50, 101, 152, 200]
......@@ -40,7 +41,7 @@ class ResNet():
"supported layers are {} but input layer is {}".format(supported_layers, layers)
if layers == 18:
depth = [2, 2, 2, 2]
depth = [2, 2, 2, 2]
elif layers == 34 or layers == 50:
depth = [3, 4, 6, 3]
elif layers == 101:
......@@ -56,7 +57,8 @@ class ResNet():
num_filters=64,
filter_size=7,
stride=2,
act='relu')
act='relu',
data_format=data_format)
else:
conv = self.conv_bn_layer(
input=input,
......@@ -64,29 +66,33 @@ class ResNet():
filter_size=3,
stride=2,
act='relu',
name='conv1_1')
name='conv1_1',
data_format=data_format)
conv = self.conv_bn_layer(
input=conv,
num_filters=32,
filter_size=3,
stride=1,
act='relu',
name='conv1_2')
name='conv1_2',
data_format=data_format)
conv = self.conv_bn_layer(
input=conv,
num_filters=64,
filter_size=3,
stride=1,
act='relu',
name='conv1_3')
name='conv1_3',
data_format=data_format)
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
pool_type='max',
data_format=data_format)
if layers >= 50:
for block in range(len(depth)):
for i in range(depth[block]):
......@@ -101,22 +107,29 @@ class ResNet():
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
if_first=block==i==0,
name=conv_name)
if_first=block == i == 0,
name=conv_name,
data_format=data_format)
else:
for block in range(len(depth)):
for i in range(depth[block]):
conv_name="res"+str(block+2)+chr(97+i)
conv_name = "res" + str(block + 2) + chr(97 + i)
conv = self.basic_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
if_first=block==i==0,
name=conv_name)
if_first=block == i == 0,
name=conv_name,
data_format=data_format)
pool = fluid.layers.pool2d(
input=conv, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
input=conv,
pool_type='avg',
global_pooling=True,
data_format=data_format)
pool_channel = pool.shape[1] if data_format == "NCHW" else pool.shape[
-1]
stdv = 1.0 / math.sqrt(pool_channel * 1.0)
out = fluid.layers.fc(
input=pool,
......@@ -133,7 +146,8 @@ class ResNet():
stride=1,
groups=1,
act=None,
name=None):
name=None,
data_format="NCHW"):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
......@@ -143,7 +157,8 @@ class ResNet():
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
bias_attr=False)
bias_attr=False,
data_format=data_format)
if name == "conv1":
bn_name = "bn_" + name
else:
......@@ -154,7 +169,8 @@ class ResNet():
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
moving_variance_name=bn_name + '_variance',
data_layout=data_format)
def conv_bn_layer_new(self,
input,
......@@ -163,14 +179,16 @@ class ResNet():
stride=1,
groups=1,
act=None,
name=None):
name=None,
data_format="NCHW"):
pool = fluid.layers.pool2d(
input=input,
pool_size=2,
pool_stride=2,
pool_padding=0,
pool_type='avg',
ceil_mode=True)
ceil_mode=True,
data_format=data_format)
conv = fluid.layers.conv2d(
input=pool,
......@@ -181,7 +199,8 @@ class ResNet():
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
bias_attr=False)
bias_attr=False,
data_format=data_format)
if name == "conv1":
bn_name = "bn_" + name
else:
......@@ -192,81 +211,114 @@ class ResNet():
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
moving_variance_name=bn_name + '_variance',
data_layout=data_format)
def shortcut(self, input, ch_out, stride, name, if_first=False):
ch_in = input.shape[1]
def shortcut(self,
input,
ch_out,
stride,
name,
if_first=False,
data_format="NCHW"):
ch_in = input.shape[1] if data_format == "NCHW" else input.shape[-1]
if ch_in != ch_out or stride != 1:
if if_first:
return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
return self.conv_bn_layer(
input,
ch_out,
1,
stride,
name=name,
data_format=data_format)
else:
return self.conv_bn_layer_new(input, ch_out, 1, stride, name=name)
return self.conv_bn_layer_new(
input,
ch_out,
1,
stride,
name=name,
data_format=data_format)
elif if_first:
return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
return self.conv_bn_layer(
input, ch_out, 1, stride, name=name, data_format=data_format)
else:
return input
def bottleneck_block(self, input, num_filters, stride, name, if_first):
def bottleneck_block(self,
input,
num_filters,
stride,
name,
if_first,
data_format="NCHW"):
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=1,
act='relu',
name=name + "_branch2a")
name=name + "_branch2a",
data_format=data_format)
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
act='relu',
name=name + "_branch2b")
name=name + "_branch2b",
data_format=data_format)
conv2 = self.conv_bn_layer(
input=conv1,
num_filters=num_filters * 4,
filter_size=1,
act=None,
name=name + "_branch2c")
name=name + "_branch2c",
data_format=data_format)
short = self.shortcut(
input,
num_filters * 4,
stride,
if_first=if_first,
name=name + "_branch1")
name=name + "_branch1",
data_format=data_format)
return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
def basic_block(self, input, num_filters, stride, name, if_first):
def basic_block(self, input, num_filters, stride, name, if_first,
data_format):
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=3,
act='relu',
input=input,
num_filters=num_filters,
filter_size=3,
act='relu',
stride=stride,
name=name+"_branch2a")
name=name + "_branch2a",
data_format=data_format)
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
act=None,
name=name+"_branch2b")
input=conv0,
num_filters=num_filters,
filter_size=3,
act=None,
name=name + "_branch2b",
data_format=data_format)
short = self.shortcut(
input,
num_filters,
stride,
if_first=if_first,
name=name + "_branch1")
input,
num_filters,
stride,
if_first=if_first,
name=name + "_branch1",
data_format=data_format)
return fluid.layers.elementwise_add(x=short, y=conv1, act='relu')
def ResNet18_vd():
model=ResNet(layers=18, is_3x3=True)
model = ResNet(layers=18, is_3x3=True)
return model
def ResNet34_vd():
model=ResNet(layers=34, is_3x3=True)
model = ResNet(layers=34, is_3x3=True)
return model
......
#!/bin/bash -ex
#Training details
export FLAGS_conv_workspace_size_limit=4000 #MB
export FLAGS_cudnn_exhaustive_search=1
export FLAGS_cudnn_batchnorm_spatial_persistent=1
DATA_DIR="Your image dataset path, e.g. ./data/ILSVRC2012/"
DATA_FORMAT="NHWC"
USE_AMP=true #whether to use amp
USE_DALI=true
USE_ADDTO=true
if ${USE_ADDTO} ;then
export FLAGS_max_inplace_grad_add=8
fi
if ${USE_DALI}; then
export FLAGS_fraction_of_gpu_memory_to_use=0.8
fi
python train.py \
--model=ResNet200_vd \
--data_dir=${DATA_DIR} \
--batch_size=64 \
--num_epochs=200 \
--total_images=1281167 \
--image_shape 4 224 224 \
--class_dim=1000 \
--print_step=10 \
--model_save_dir=output/ \
--lr_strategy=cosine_decay \
--use_amp=${USE_AMP} \
--scale_loss=128.0 \
--use_dynamic_loss_scaling=true \
--data_format=${DATA_FORMAT} \
--fuse_elewise_add_act_ops=true \
--fuse_bn_act_ops=true \
--fuse_bn_add_act_ops=true \
--enable_addto=${USE_ADDTO} \
--validate=true \
--is_profiler=false \
--profiler_path=profile/ \
--reader_thread=10 \
--reader_buf_size=4000 \
--use_dali=${USE_DALI} \
--lr=0.1 \
--l2_decay=1e-4 \
--use_label_smoothing=True \
--label_smoothing_epsilon=0.1
......@@ -95,13 +95,16 @@ For more pretrained model selection, please refer to [PretrainedModels](./paddle
- [Models API](./docs/models.md)
## Tutorials
Please refer to our official AI Studio account for more interactive tutorials: [PaddleNLP on AI Studio](https://aistudio.baidu.com/aistudio/personalcenter/thirdview/574995)
* [What's Seq2Vec?](https://aistudio.baidu.com/aistudio/projectdetail/1294333) shows how to use LSTM to do sentiment analysis.
* [What's Seq2Vec?](https://aistudio.baidu.com/aistudio/projectdetail/1283423) shows how to use LSTM to do sentiment analysis.
* [Sentiment Analysis with ERNIE](https://aistudio.baidu.com/aistudio/projectdetail/1283423) shows how to exploit the pretrained ERNIE to make sentiment analysis better.
* [Sentiment Analysis with ERNIE](https://aistudio.baidu.com/aistudio/projectdetail/1294333) shows how to exploit the pretrained ERNIE to make sentiment analysis better.
* [Waybill Information Extraction with BiGRU-CRF Model](https://aistudio.baidu.com/aistudio/projectdetail/1317771) shows how to make use of bigru and crf to do information extraction.
......
# BERT Benchmark with Fleet API
BERT - Bidirectional Encoder Representations from Transformers [paper link](https://arxiv.org/abs/1810.04805)
PaddlePaddle implements BERT pre-training and downstream fine-tuning tasks. For pre-training, both single-machine and multi-machine versions are provided, together with a mixed-precision interface for acceleration that can be enabled as needed.
PaddlePaddle implements BERT pre-training and downstream fine-tuning tasks.
## Datasets
### Pre-training dataset
......@@ -10,7 +10,8 @@ PaddlePaddle实现了BERT的预训练模型(Pre-training)和下游任务(Fin
## Pre-training
### Environment variable setup
1. Install paddlenlp
pip install paddlenlp==2.0.0a2 -i https://pypi.org/simple
pip install paddlenlp==2.0.0b0 -i https://pypi.org/simple
2. Set the environment variable pointing to the pre-training data
```shell
export DATA_DIR=${HOME}/bert_data/wikicorpus_en
......@@ -54,26 +55,6 @@ python ./run_pretrain_single.py \
--max_steps 1000000
```
### Training speed comparison
The speed comparison is done with the bert-base model, comparing single-machine and multi-machine multi-card (4 machines, 32 GPUs) training. All GPU tests are run on Tesla V100-SXM2-16GB with the following configuration:
- InfiniBand 100 Gb/sec (4X EDR), Mellanox Technologies MT27700 Family
- 48 CPU(s), Intel(R) Xeon(R) Gold 5118 CPU @ 2.30GHz
- Memory 500G
- Ubuntu 16.04.4 LTS (GNU/Linux 4.4.0-116-generic x86_64)
- CUDA Version: 10.2, Driver API Version: 10.2, Driver Version: 440.33.01
- cuDNN Version: 7.6
- PaddlePaddle version: paddlepaddle-gpu >= 2.0.0rc1
- PaddleNLP version: paddlenlp >= 2.0.0a2
Speed is measured as the number of samples the pre-training model processes per second, with
- batch_size=64
- max_seq_length=128
The detailed comparison is shown below:
| node num | gpu num/node | gpu num | batch_size/gpu | Throughput | Speedup |
|----------|--------------|---------|----------------|------------|---------|
## Fine-tuning
After BERT pre-training is finished, the pre-trained parameters can be used to fine-tune on specific NLP tasks. The following shows how to fine-tune a classification task using the open-source pre-trained model.
......
......@@ -172,13 +172,27 @@ def reset_program_state_dict(model, state_dict):
loc=0.0, scale=scale, size=p.shape).astype(dtype_str)
return new_state_dict
def build_compiled_program(main_program, loss):
def create_strategy():
"""
Create build strategy and exec strategy.
Args:
Returns:
build_strategy: build strategy
exec_strategy: exec strategy
"""
build_strategy = paddle.static.BuildStrategy()
exec_strategy = paddle.static.ExecutionStrategy()
build_strategy.enable_addto = args.enable_addto
exec_strategy.num_threads = 1
exec_strategy.num_iteration_per_drop_scope = 10000
build_strategy = paddle.static.BuildStrategy()
build_strategy.enable_addto = args.enable_addto
return build_strategy, exec_strategy
def build_compiled_program(main_program, loss):
build_strategy, exec_strategy = create_strategy()
main_program = paddle.static.CompiledProgram(
main_program).with_data_parallel(
loss_name=loss.name,
......@@ -187,6 +201,33 @@ def build_compiled_program(main_program, loss):
return main_program
def dist_optimizer(args, optimizer):
"""
Create a distributed optimizer based on a normal optimizer
Args:
args:
optimizer: a normal optimizer
Returns:
optimizer: a distributed optimizer
"""
build_strategy, exec_strategy = create_strategy()
dist_strategy = fleet.DistributedStrategy()
dist_strategy.execution_strategy = exec_strategy
dist_strategy.build_strategy = build_strategy
dist_strategy.fuse_grad_size_in_MB = 16
if args.use_amp:
dist_strategy.amp = True
dist_strategy.amp_configs = {
'custom_white_list': ['softmax', 'layer_norm', 'gelu'],
'init_loss_scaling': args.scale_loss,
}
optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)
return optimizer
def set_seed(seed):
random.seed(seed)
np.random.seed(seed)
......@@ -208,9 +249,12 @@ def do_train(args):
place = paddle.set_device(args.select_device)
fleet.init(is_collective=True)
worker_num = fleet.worker_num()
worker_index = fleet.worker_index()
# Create the random seed for the worker
set_seed(args.seed)
worker_init = WorkerInitObj(args.seed + fleet.worker_index())
worker_init = WorkerInitObj(args.seed + worker_index)
# Define the input data in the static mode
main_program = paddle.static.default_main_program()
......@@ -260,7 +304,7 @@ def do_train(args):
p.name for n, p in model.named_parameters()
if not any(nd in n for nd in ["bias", "norm"])
])
if args.use_amp:
if worker_num == 1 and args.use_amp:
amp_list = paddle.fluid.contrib.mixed_precision.AutoMixedPrecisionLists(
custom_white_list=['softmax', 'layer_norm', 'gelu'])
optimizer = paddle.fluid.contrib.mixed_precision.decorate(
......@@ -268,9 +312,10 @@ def do_train(args):
amp_list,
init_loss_scaling=args.scale_loss,
use_dynamic_loss_scaling=True)
# Use the fleet api to compile the distributed optimizer
strategy = fleet.DistributedStrategy()
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
if worker_num > 1:
# Use the fleet api to compile the distributed optimizer
optimizer = dist_optimizer(args, optimizer)
optimizer.minimize(loss)
# Define the Executor for running the static model
......@@ -281,14 +326,14 @@ def do_train(args):
# Use the state dict to update the parameter
reset_state_dict = reset_program_state_dict(model, state_dict)
paddle.static.set_program_state(main_program, reset_state_dict)
# Construct the compiled program
main_program = build_compiled_program(main_program, loss)
if worker_num == 1:
# Construct the compiled program
main_program = build_compiled_program(main_program, loss)
pool = ThreadPoolExecutor(1)
global_step = 0
tic_train = time.time()
worker_num = fleet.worker_num()
worker_index = fleet.worker_index()
epoch = 0
while True:
files = [
......
......@@ -27,6 +27,12 @@ pool_size: 200000
sort_type: "global"
batch_size: 4096
infer_batch_size: 16
shuffle_batch: True
# Data shuffle only works when sort_type is pool or none
shuffle: True
# shuffle_seed must be set when shuffle is True and using multi-cards to train.
# Otherwise, the number of batches cannot be guaranteed.
shuffle_seed: 128
# Hyparams for training:
# The number of epoches for training
......
......@@ -43,6 +43,12 @@ def create_data_loader(args):
mode=m, transform_func=transform_func) for m in ["train", "dev"]
]
if args.shuffle or args.shuffle_batch:
if args.shuffle_seed == "None" or args.shuffle_seed is None:
shuffle_seed = 0
else:
shuffle_seed = args.shuffle_seed
def _max_token_fn(current_idx, current_batch_size, tokens_sofar,
data_source):
return max(tokens_sofar,
......@@ -60,19 +66,17 @@ def create_data_loader(args):
min_max_filer, max_len=args.max_length))
sampler = SamplerHelper(dataset)
src_key = (lambda x, data_source: len(data_source[x][0]) + 1)
if args.sort_type == SortType.GLOBAL:
buffer_size = -1
src_key = (lambda x, data_source: len(data_source[x][0]) + 1)
trg_key = (lambda x, data_source: len(data_source[x][1]) + 1)
# Sort twice
sampler = sampler.sort(
key=trg_key, buffer_size=buffer_size).sort(
key=src_key, buffer_size=buffer_size)
sampler = sampler.sort(key=trg_key).sort(key=src_key)
else:
sampler = sampler.shuffle()
if args.shuffle:
sampler = sampler.shuffle(seed=shuffle_seed)
max_key = (lambda x, data_source: max(len(data_source[x][0]), len(data_source[x][1])) + 1)
if args.sort_type == SortType.POOL:
buffer_size = args.pool_size
sampler = sampler.sort(key=src_key, buffer_size=buffer_size)
sampler = sampler.sort(key=max_key, buffer_size=args.pool_size)
batch_sampler = sampler.batch(
batch_size=args.batch_size,
......@@ -80,6 +84,9 @@ def create_data_loader(args):
batch_size_fn=_max_token_fn,
key=_key)
if args.shuffle_batch:
batch_sampler.shuffle(seed=shuffle_seed)
if m == "train":
batch_sampler = batch_sampler.shard()
......
- [Embedding Model Summary](#embedding-模型汇总)
- [Chinese Word Embeddings](#中文词向量)
- [English Word Embeddings](#英文词向量)
- [GloVe](#glove)
- [FastText](#fasttext)
- [Model Information](#模型信息)
- [Acknowledgements](#致谢)
- [References](#参考论文)
# Embedding Model Summary
PaddleNLP provides a number of open-source pre-trained embedding models. Simply pass the name of a pre-trained model to `paddlenlp.embeddings.TokenEmbedding` to load the corresponding weights. The pre-trained embedding models supported by PaddleNLP are listed below; their names are used as the argument to `paddlenlp.embeddings.TokenEmbedding`. The naming convention is \${training model}.\${corpus}.\${embedding type}.\${co-occurrence type}.dim\${dimension}. Three training models are available: Word2Vec (w2v, trained with the skip-gram model), GloVe (glove) and FastText (fasttext).
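A minimal usage sketch (assuming paddlenlp 2.0+, where `TokenEmbedding` accepts an `embedding_name` and exposes `search()` and `cosine_sim()`; the model name below is one from the tables in this document):

```python
from paddlenlp.embeddings import TokenEmbedding

# Load one of the pre-trained embedding models listed below by name.
token_embedding = TokenEmbedding(
    embedding_name="w2v.baidu_encyclopedia.target.word-word.dim300")

# Look up word vectors and compute a similarity score.
vectors = token_embedding.search(["中国", "北京"])   # one 300-dim vector per word
print(vectors.shape)
print(token_embedding.cosine_sim("中国", "北京"))
```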
......@@ -42,11 +51,91 @@ PaddleNLP提供多个开源的预训练Embedding模型,用户仅需在使用`p
## English Word Embeddings
To be updated.
### GloVe
| Corpus | 25-dim | 50-dim | 100-dim | 200-dim | 300-dim |
| ----------------- | ------ | ------ | ------ | ------ | ------ |
| Wiki2014 + GigaWord | N/A | glove.wiki2014-gigaword.target.word-word.dim50.en | glove.wiki2014-gigaword.target.word-word.dim100.en | glove.wiki2014-gigaword.target.word-word.dim200.en | glove.wiki2014-gigaword.target.word-word.dim300.en |
| Twitter | glove.twitter.target.word-word.dim25.en | glove.twitter.target.word-word.dim50.en | glove.twitter.target.word-word.dim100.en | glove.twitter.target.word-word.dim200.en | N/A |
### FastText
| Corpus | Name |
|------|------|
| Wiki2017 | fasttext.wiki-news.target.word-word.dim300.en |
| Crawl | fasttext.crawl.target.word-word.dim300.en |
## Model Information
| Model | File Size | Vocab Size |
|-----|---------|---------|
| w2v.baidu_encyclopedia.target.word-word.dim300 | 678.21 MB | 635965 |
| w2v.baidu_encyclopedia.target.word-character.char1-1.dim300 | 679.15 MB | 636038 |
| w2v.baidu_encyclopedia.target.word-character.char1-2.dim300 | 679.30 MB | 636038 |
| w2v.baidu_encyclopedia.target.word-character.char1-4.dim300 | 679.51 MB | 636038 |
| w2v.baidu_encyclopedia.target.word-ngram.1-2.dim300 | 679.48 MB | 635977 |
| w2v.baidu_encyclopedia.target.word-ngram.1-3.dim300 | 671.27 MB | 628669 |
| w2v.baidu_encyclopedia.target.word-ngram.2-2.dim300 | 7.28 GB | 6969069 |
| w2v.baidu_encyclopedia.target.word-wordLR.dim300 | 678.22 MB | 635958 |
| w2v.baidu_encyclopedia.target.word-wordPosition.dim300 | 679.32 MB | 636038 |
| w2v.baidu_encyclopedia.target.bigram-char.dim300 | 679.29 MB | 635976 |
| w2v.baidu_encyclopedia.context.word-word.dim300 | 677.74 MB | 635952 |
| w2v.baidu_encyclopedia.context.word-character.char1-1.dim300 | 678.65 MB | 636200 |
| w2v.baidu_encyclopedia.context.word-character.char1-2.dim300 | 844.23 MB | 792631 |
| w2v.baidu_encyclopedia.context.word-character.char1-4.dim300 | 1.16 GB | 1117461 |
| w2v.baidu_encyclopedia.context.word-ngram.1-2.dim300 | 7.25 GB | 6967598 |
| w2v.baidu_encyclopedia.context.word-ngram.1-3.dim300 | 5.21 GB | 5000001 |
| w2v.baidu_encyclopedia.context.word-ngram.2-2.dim300 | 7.26 GB | 6968998 |
| w2v.baidu_encyclopedia.context.word-wordLR.dim300 | 1.32 GB | 1271031 |
| w2v.baidu_encyclopedia.context.word-wordPosition.dim300 | 6.47 GB | 6293920 |
| w2v.wiki.target.bigram-char.dim300 | 375.98 MB | 352274 |
| w2v.wiki.target.word-char.dim300 | 375.52 MB | 352223 |
| w2v.wiki.target.word-word.dim300 | 374.95 MB | 352219 |
| w2v.wiki.target.word-bigram.dim300 | 375.72 MB | 352219 |
| w2v.people_daily.target.bigram-char.dim300 | 379.96 MB | 356055 |
| w2v.people_daily.target.word-char.dim300 | 379.45 MB | 355998 |
| w2v.people_daily.target.word-word.dim300 | 378.93 MB | 355989 |
| w2v.people_daily.target.word-bigram.dim300 | 379.68 MB | 355991 |
| w2v.weibo.target.bigram-char.dim300 | 208.24 MB | 195199 |
| w2v.weibo.target.word-char.dim300 | 208.03 MB | 195204 |
| w2v.weibo.target.word-word.dim300 | 207.94 MB | 195204 |
| w2v.weibo.target.word-bigram.dim300 | 208.19 MB | 195204 |
| w2v.sogou.target.bigram-char.dim300 | 389.81 MB | 365112 |
| w2v.sogou.target.word-char.dim300 | 389.89 MB | 365078 |
| w2v.sogou.target.word-word.dim300 | 388.66 MB | 364992 |
| w2v.sogou.target.word-bigram.dim300 | 388.66 MB | 364994 |
| w2v.zhihu.target.bigram-char.dim300 | 277.35 MB | 259755 |
| w2v.zhihu.target.word-char.dim300 | 277.40 MB | 259940 |
| w2v.zhihu.target.word-word.dim300 | 276.98 MB | 259871 |
| w2v.zhihu.target.word-bigram.dim300 | 277.53 MB | 259885 |
| w2v.financial.target.bigram-char.dim300 | 499.52 MB | 467163 |
| w2v.financial.target.word-char.dim300 | 499.17 MB | 467343 |
| w2v.financial.target.word-word.dim300 | 498.94 MB | 467324 |
| w2v.financial.target.word-bigram.dim300 | 499.54 MB | 467331 |
| w2v.literature.target.bigram-char.dim300 | 200.69 MB | 187975 |
| w2v.literature.target.word-char.dim300 | 200.44 MB | 187980 |
| w2v.literature.target.word-word.dim300 | 200.28 MB | 187961 |
| w2v.literature.target.word-bigram.dim300 | 200.59 MB | 187962 |
| w2v.sikuquanshu.target.word-word.dim300 | 20.70 MB | 19529 |
| w2v.sikuquanshu.target.word-bigram.dim300 | 20.77 MB | 19529 |
| w2v.mixed-large.target.word-char.dim300 | 1.35 GB | 1292552 |
| w2v.mixed-large.target.word-word.dim300 | 1.35 GB | 1292483 |
| glove.wiki2014-gigaword.target.word-word.dim50.en | 73.45 MB | 400002 |
| glove.wiki2014-gigaword.target.word-word.dim100.en | 143.30 MB | 400002 |
| glove.wiki2014-gigaword.target.word-word.dim200.en | 282.97 MB | 400002 |
| glove.wiki2014-gigaword.target.word-word.dim300.en | 422.83 MB | 400002 |
| glove.twitter.target.word-word.dim25.en | 116.92 MB | 1193516 |
| glove.twitter.target.word-word.dim50.en | 221.64 MB | 1193516 |
| glove.twitter.target.word-word.dim100.en | 431.08 MB | 1193516 |
| glove.twitter.target.word-word.dim200.en | 848.56 MB | 1193516 |
| fasttext.wiki-news.target.word-word.dim300.en | 541.63 MB | 999996 |
| fasttext.crawl.target.word-word.dim300.en | 1.19 GB | 2000002 |
## Acknowledgements
- Thanks to [Chinese-Word-Vectors](https://github.com/Embedding/Chinese-Word-Vectors) for providing the Chinese Word2Vec embeddings
- Thanks to [Chinese-Word-Vectors](https://github.com/Embedding/Chinese-Word-Vectors) for the pre-trained Chinese Word2Vec embeddings, the [GloVe Project](https://nlp.stanford.edu/projects/glove) for the pre-trained English GloVe embeddings, and the [FastText Project](https://fasttext.cc/docs/en/english-vectors.html) for the pre-trained English fastText embeddings
## References
- Li, Shen, et al. "Analogical reasoning on chinese morphological and semantic relations." arXiv preprint arXiv:1805.06504 (2018).
- Qiu, Yuanyuan, et al. "Revisiting correlations between intrinsic and extrinsic evaluations of word embeddings." Chinese Computational Linguistics and Natural Language Processing Based on Naturally Annotated Big Data. Springer, Cham, 2018. 209-221.
- Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. GloVe: Global Vectors for Word Representation.
- T. Mikolov, E. Grave, P. Bojanowski, C. Puhrsch, A. Joulin. Advances in Pre-Training Distributed Word Representations
......@@ -39,17 +39,17 @@ DGU模型中的6个任务,分别采用不同的评估指标在test集上进行
* PaddlePaddle installation
This project requires PaddlePaddle 2.0 or later; see the [installation guide](http://www.paddlepaddle.org/#quick-start)
This project requires PaddlePaddle 2.0rc1 or later; see the [installation guide](http://www.paddlepaddle.org/#quick-start)
* PaddleNLP installation
```shell
pip install paddlenlp
pip install paddlenlp>=2.0.0b
```
* Environment requirements
Python 3.6+ is required; for other environment requirements, see the PaddlePaddle [installation notes](https://www.paddlepaddle.org.cn/install/quick/zh/2.0rc-linux-docker)
Python 3.6+ is required
### Code structure
......
......@@ -18,7 +18,7 @@ PLATO-2的训练过程及其他细节详见 [Knover](https://github.com/PaddlePa
* PaddlePaddle installation
This project requires PaddlePaddle 2.0 or later; see the [installation guide](http://www.paddlepaddle.org/#quick-start)
This project requires PaddlePaddle 2.0rc1 or later; see the [installation guide](http://www.paddlepaddle.org/#quick-start)
* PaddleNLP installation
......@@ -28,13 +28,13 @@ PLATO-2的训练过程及其他细节详见 [Knover](https://github.com/PaddlePa
* Environment requirements
Python 3.6+ is required
Python 3.6+ is required
This project depends on sentencepiece and termcolor; install them before running the project
This project depends on sentencepiece and termcolor; install them before running the project
```shell
pip install sentencepiece termcolor
```
```shell
pip install sentencepiece termcolor
```
### Code structure
......
# BERT
## Model Introduction
[BERT](https://arxiv.org/abs/1810.04805) (Bidirectional Encoder Representations from Transformers) uses the [Transformer](https://arxiv.org/abs/1706.03762) encoder as its basic building block. It is pre-trained on large-scale unlabeled text with two objectives, Masked Language Model and Next Sentence Prediction, yielding a general-purpose semantic representation that fuses bidirectional context. Starting from this pre-trained representation and adding a simple task-specific output layer, fine-tuning adapts the model to downstream NLP tasks and usually outperforms models trained directly on the downstream task. BERT achieved SOTA results on the [GLUE benchmark](https://gluebenchmark.com/tasks).
This project is an open-source implementation of BERT on Paddle 2.0, including pre-training code and fine-tuning code for the [GLUE benchmark](https://gluebenchmark.com/tasks).
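Before the step-by-step instructions below, here is a minimal inference sketch that mirrors how `run_glue.py` in this project calls the tokenizer and model; the checkpoint name and `num_classes` are illustrative assumptions rather than anything prescribed by this project:

```python
import paddle
from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer

# Illustrative checkpoint; any BERT weight name supported by paddlenlp works here.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_classes=2)
model.eval()

# Same tokenize -> ids path used by run_glue.py: tokenizer(text) returns tokens.
tokens = ["[CLS]"] + tokenizer("paddlenlp makes bert easy to use .") + ["[SEP]"]
input_ids = paddle.to_tensor([tokenizer.convert_tokens_to_ids(tokens)])
segment_ids = paddle.zeros_like(input_ids)

with paddle.no_grad():
    logits = model(input_ids, segment_ids)  # shape: [1, num_classes]
print(logits.numpy())
```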
## Quick Start
### Installation
* PaddlePaddle installation
This project requires PaddlePaddle 2.0rc1 or later; see the [installation guide](http://www.paddlepaddle.org/#quick-start)
* PaddleNLP installation
```shell
pip install paddlenlp>=2.0.0b
```
### Data Preparation
#### Pre-training data preparation
`create_pretraining_data.py` builds the data needed by the pre-training program. It takes text files as input (newline-delimited, with blank-line separators; sample data is provided under the data directory), tokenizes them with the BERT tokenizer, generates positive/negative sentence-pair samples and masked tokens, and writes the result as an hdf5 file. Usage:
```shell
python create_pretraining_data.py \
--input_file=data/sample_text.txt \
--output_file=data/training_data.hdf5 \
--bert_model=bert-base-uncased \
--max_seq_length=128 \
--max_predictions_per_seq=20 \
--masked_lm_prob=0.15 \
--random_seed=12345 \
--dupe_factor=5
```
The parameters are described as follows:
- `input_file` specifies the input file; a directory may be given, in which case all `.txt` files in that directory are included.
- `output_file` specifies the output file.
- `bert_model` specifies which BERT model's tokenizer to use for tokenization.
- `max_seq_length` specifies the maximum sequence length; longer sequences are truncated and shorter ones are padded.
- `max_predictions_per_seq` is the maximum number of masked tokens per sequence.
- `masked_lm_prob` is the probability of masking each token.
- `random_seed` specifies the random seed.
- `dupe_factor` specifies how many times the input data is processed; each pass generates new random masks.
The data generation program above can also be used to process domain-specific data for continued pre-training. To use the English Wiki and BookCorpus data used for pre-training in the BERT paper, refer to [here](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/BERT) for processing; the resulting data can be fed directly into this project's pre-training program.
#### Fine-tuning data preparation
##### GLUE benchmark data
The datasets for the GLUE benchmark tasks are provided through paddlenlp's API, so no manual preparation is needed; they are downloaded automatically when fine-tuning with `run_glue.py`.
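As an illustration, the same dataset API used by `run_glue.py` can be called directly; the split names and printed fields below are assumptions based on that script:

```python
from paddlenlp.datasets import GlueSST2

# First use triggers an automatic download, as described above.
train_ds, dev_ds = GlueSST2.get_datasets(["train", "dev"])
print(train_ds.get_labels())  # label set of the task
print(train_ds[0])            # a raw (sentence, label) example
```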
### Running Pre-training
```shell
python -u ./run_pretrain.py \
--model_type bert \
--model_name_or_path bert-base-uncased \
--max_predictions_per_seq 20 \
--batch_size 32 \
--learning_rate 1e-4 \
--weight_decay 1e-2 \
--adam_epsilon 1e-6 \
--warmup_steps 10000 \
--num_train_epochs 3 \
--input_dir data/ \
--output_dir pretrained_models/ \
--logging_steps 1 \
--save_steps 20000 \
--max_steps 1000000 \
--n_gpu 1
```
The parameters are described as follows:
- `model_type` indicates the model type; set it to bert when using a BERT model.
- `model_name_or_path` indicates a model of a particular configuration, together with its pre-trained weights and the tokenizer used during pre-training. If the model files are stored locally, the corresponding directory path can be given here instead.
- `max_predictions_per_seq` is the maximum number of masked tokens per sequence, consistent with the setting used when creating the pre-training data.
- `batch_size` is the number of samples **per card** per iteration.
- `learning_rate` is the base learning rate; it is multiplied by the value produced by the learning rate scheduler to obtain the current learning rate.
- `weight_decay` is the weight_decay coefficient used by the AdamW optimizer.
- `adam_epsilon` is the epsilon value used by the AdamW optimizer.
- `warmup_steps` is the number of steps of linear learning rate warmup (see the sketch after this list).
- `num_train_epochs` is the number of training epochs.
- `input_dir` is the input data directory; all files in it whose names contain "training" are used as training data.
- `output_dir` is the directory where models are saved.
- `logging_steps` is the logging interval.
- `save_steps` is the interval for saving and evaluating the model.
- `max_steps` is the maximum number of training steps. If training for `num_train_epochs` epochs would take more steps than this value, training stops early once `max_steps` is reached.
- `n_gpu` is the number of GPU cards to use. Set it to the desired number for multi-card training; if 0, the CPU is used.
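A minimal sketch of how `learning_rate`, `warmup_steps` and `max_steps` interact; the multiplier below (linear warmup followed by linear decay) illustrates the usual BERT schedule and is not necessarily the exact scheduler implemented in `run_pretrain.py`:

```python
# Illustrative values matching the command above.
base_lr, warmup_steps, max_steps = 1e-4, 10000, 1000000

def lr_multiplier(step, warmup=warmup_steps, total=max_steps):
    """Linear warmup to 1.0 at `warmup`, then linear decay to 0.0 at `total`."""
    if step < warmup:
        return step / max(1, warmup)
    return max(0.0, (total - step) / max(1, total - warmup))

for step in (0, 5000, 10000, 500000, 1000000):
    print(step, base_lr * lr_multiplier(step))
```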
### Running Fine-tuning
Taking the SST-2 task from GLUE as an example, fine-tuning is launched as follows:
```shell
python -u ./run_glue.py \
--model_type bert \
--model_name_or_path bert-base-uncased \
--task_name SST-2 \
--max_seq_length 128 \
--batch_size 32 \
--learning_rate 2e-5 \
--num_train_epochs 3 \
--logging_steps 1 \
--save_steps 500 \
--output_dir ./tmp/ \
--n_gpu 1
```
The parameters are described as follows:
- `model_type` indicates the model type; set it to bert when using a BERT model.
- `model_name_or_path` indicates a model of a particular configuration, together with its pre-trained weights and the tokenizer used during pre-training. If the model files are stored locally, the corresponding directory path can be given here instead.
- `task_name` is the fine-tuning task.
- `max_seq_length` is the maximum sequence length; longer sequences are truncated.
- `batch_size` is the number of samples **per card** per iteration.
- `learning_rate` is the base learning rate; it is multiplied by the value produced by the learning rate scheduler to obtain the current learning rate.
- `num_train_epochs` is the number of training epochs.
- `logging_steps` is the logging interval.
- `save_steps` is the interval for saving and evaluating the model.
- `output_dir` is the path where models are saved.
- `n_gpu` is the number of GPU cards to use. Set it to the desired number for multi-card training; if 0, the CPU is used.
After fine-tuning `bert-base-uncased` on the GLUE tasks, the results on the dev sets are as follows:
| Task | Metric | Result |
|:-----:|:----------------------------:|:-----------------:|
| SST-2 | Accuracy | 0.92660 |
| QNLI | Accuracy | 0.91707 |
| CoLA  | Matthew's corr               | 0.59557           |
| MRPC | F1/Accuracy | 0.91667/0.88235 |
| STS-B | Pearson/Spearman corr        | 0.88847/0.88350   |
| QQP | Accuracy/F1 | 0.90581/0.87347 |
| MNLI | Matched acc/MisMatched acc | 0.84422/0.84825 |
| RTE | Accuracy | 0.711191 |
# coding=utf-8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Create masked LM/next sentence masked_lm TF examples for BERT."""
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import logging
import os
import random
from io import open
import h5py
import numpy as np
from tqdm import tqdm
from paddlenlp.transformers import BertTokenizer
from paddlenlp.transformers.tokenizer_utils import convert_to_unicode
import random
import collections
class TrainingInstance(object):
"""A single training instance (sentence pair)."""
def __init__(self, tokens, segment_ids, masked_lm_positions,
masked_lm_labels, is_random_next):
self.tokens = tokens
self.segment_ids = segment_ids
self.is_random_next = is_random_next
self.masked_lm_positions = masked_lm_positions
self.masked_lm_labels = masked_lm_labels
def write_instance_to_example_file(instances, tokenizer, max_seq_length,
max_predictions_per_seq, output_file):
"""Create TF example files from `TrainingInstance`s."""
total_written = 0
features = collections.OrderedDict()
num_instances = len(instances)
features["input_ids"] = np.zeros([num_instances, max_seq_length],
dtype="int32")
features["input_mask"] = np.zeros([num_instances, max_seq_length],
dtype="int32")
features["segment_ids"] = np.zeros([num_instances, max_seq_length],
dtype="int32")
features["masked_lm_positions"] = np.zeros(
[num_instances, max_predictions_per_seq], dtype="int32")
features["masked_lm_ids"] = np.zeros(
[num_instances, max_predictions_per_seq], dtype="int32")
features["next_sentence_labels"] = np.zeros(num_instances, dtype="int32")
for inst_index, instance in enumerate(tqdm(instances)):
input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
input_mask = [1] * len(input_ids)
segment_ids = list(instance.segment_ids)
assert len(input_ids) <= max_seq_length
while len(input_ids) < max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
masked_lm_positions = list(instance.masked_lm_positions)
masked_lm_ids = tokenizer.convert_tokens_to_ids(
instance.masked_lm_labels)
masked_lm_weights = [1.0] * len(masked_lm_ids)
while len(masked_lm_positions) < max_predictions_per_seq:
masked_lm_positions.append(0)
masked_lm_ids.append(0)
masked_lm_weights.append(0.0)
next_sentence_label = 1 if instance.is_random_next else 0
features["input_ids"][inst_index] = input_ids
features["input_mask"][inst_index] = input_mask
features["segment_ids"][inst_index] = segment_ids
features["masked_lm_positions"][inst_index] = masked_lm_positions
features["masked_lm_ids"][inst_index] = masked_lm_ids
features["next_sentence_labels"][inst_index] = next_sentence_label
total_written += 1
print("saving data")
f = h5py.File(output_file, 'w')
f.create_dataset("input_ids",
data=features["input_ids"],
dtype='i4',
compression='gzip')
f.create_dataset("input_mask",
data=features["input_mask"],
dtype='i1',
compression='gzip')
f.create_dataset("segment_ids",
data=features["segment_ids"],
dtype='i1',
compression='gzip')
f.create_dataset("masked_lm_positions",
data=features["masked_lm_positions"],
dtype='i4',
compression='gzip')
f.create_dataset("masked_lm_ids",
data=features["masked_lm_ids"],
dtype='i4',
compression='gzip')
f.create_dataset("next_sentence_labels",
data=features["next_sentence_labels"],
dtype='i1',
compression='gzip')
f.flush()
f.close()
def create_training_instances(input_files, tokenizer, max_seq_length,
dupe_factor, short_seq_prob, masked_lm_prob,
max_predictions_per_seq, rng):
"""Create `TrainingInstance`s from raw text."""
all_documents = [[]]
# Input file format:
# (1) One sentence per line. These should ideally be actual sentences, not
# entire paragraphs or arbitrary spans of text. (Because we use the
# sentence boundaries for the "next sentence prediction" task).
# (2) Blank lines between documents. Document boundaries are needed so
# that the "next sentence prediction" task doesn't span between documents.
for input_file in input_files:
print("creating instance from {}".format(input_file))
with open(input_file, "r") as reader:
while True:
line = convert_to_unicode(reader.readline())
if not line:
break
line = line.strip()
# Empty lines are used as document delimiters
if not line:
all_documents.append([])
# tokens = tokenizer.tokenize(line)
tokens = tokenizer(line)
if tokens:
all_documents[-1].append(tokens)
# Remove empty documents
all_documents = [x for x in all_documents if x]
rng.shuffle(all_documents)
# vocab_words = list(tokenizer.vocab.keys())
vocab_words = list(tokenizer.vocab.token_to_idx.keys())
instances = []
for _ in range(dupe_factor):
for document_index in range(len(all_documents)):
instances.extend(
create_instances_from_document(all_documents, document_index,
max_seq_length, short_seq_prob,
masked_lm_prob,
max_predictions_per_seq,
vocab_words, rng))
rng.shuffle(instances)
return instances
def create_instances_from_document(all_documents, document_index,
max_seq_length, short_seq_prob,
masked_lm_prob, max_predictions_per_seq,
vocab_words, rng):
"""Creates `TrainingInstance`s for a single document."""
document = all_documents[document_index]
# Account for [CLS], [SEP], [SEP]
max_num_tokens = max_seq_length - 3
# We *usually* want to fill up the entire sequence since we are padding
# to `max_seq_length` anyways, so short sequences are generally wasted
# computation. However, we *sometimes*
# (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
# sequences to minimize the mismatch between pre-training and fine-tuning.
# The `target_seq_length` is just a rough target however, whereas
# `max_seq_length` is a hard limit.
target_seq_length = max_num_tokens
if rng.random() < short_seq_prob:
target_seq_length = rng.randint(2, max_num_tokens)
# We DON'T just concatenate all of the tokens from a document into a long
# sequence and choose an arbitrary split point because this would make the
# next sentence prediction task too easy. Instead, we split the input into
# segments "A" and "B" based on the actual "sentences" provided by the user
# input.
instances = []
current_chunk = []
current_length = 0
i = 0
while i < len(document):
segment = document[i]
current_chunk.append(segment)
current_length += len(segment)
if i == len(document) - 1 or current_length >= target_seq_length:
if current_chunk:
# `a_end` is how many segments from `current_chunk` go into the `A`
# (first) sentence.
a_end = 1
if len(current_chunk) >= 2:
a_end = rng.randint(1, len(current_chunk) - 1)
tokens_a = []
for j in range(a_end):
tokens_a.extend(current_chunk[j])
tokens_b = []
# Random next
is_random_next = False
if len(current_chunk) == 1 or rng.random() < 0.5:
is_random_next = True
target_b_length = target_seq_length - len(tokens_a)
# This should rarely go for more than one iteration for large
# corpora. However, just to be careful, we try to make sure that
# the random document is not the same as the document
# we're processing.
for _ in range(10):
random_document_index = rng.randint(
0,
len(all_documents) - 1)
if random_document_index != document_index:
break
#If picked random document is the same as the current document
if random_document_index == document_index:
is_random_next = False
random_document = all_documents[random_document_index]
random_start = rng.randint(0, len(random_document) - 1)
for j in range(random_start, len(random_document)):
tokens_b.extend(random_document[j])
if len(tokens_b) >= target_b_length:
break
# We didn't actually use these segments so we "put them back" so
# they don't go to waste.
num_unused_segments = len(current_chunk) - a_end
i -= num_unused_segments
# Actual next
else:
is_random_next = False
for j in range(a_end, len(current_chunk)):
tokens_b.extend(current_chunk[j])
truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)
assert len(tokens_a) >= 1
assert len(tokens_b) >= 1
tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in tokens_a:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
for token in tokens_b:
tokens.append(token)
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)
(tokens, masked_lm_positions,
masked_lm_labels) = create_masked_lm_predictions(
tokens, masked_lm_prob, max_predictions_per_seq,
vocab_words, rng)
instance = TrainingInstance(
tokens=tokens,
segment_ids=segment_ids,
is_random_next=is_random_next,
masked_lm_positions=masked_lm_positions,
masked_lm_labels=masked_lm_labels)
instances.append(instance)
current_chunk = []
current_length = 0
i += 1
return instances
MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
["index", "label"])
def create_masked_lm_predictions(tokens, masked_lm_prob,
max_predictions_per_seq, vocab_words, rng):
"""Creates the predictions for the masked LM objective."""
cand_indexes = []
for (i, token) in enumerate(tokens):
if token == "[CLS]" or token == "[SEP]":
continue
cand_indexes.append(i)
rng.shuffle(cand_indexes)
output_tokens = list(tokens)
num_to_predict = min(max_predictions_per_seq,
max(1, int(round(len(tokens) * masked_lm_prob))))
masked_lms = []
covered_indexes = set()
for index in cand_indexes:
if len(masked_lms) >= num_to_predict:
break
if index in covered_indexes:
continue
covered_indexes.add(index)
masked_token = None
# 80% of the time, replace with [MASK]
if rng.random() < 0.8:
masked_token = "[MASK]"
else:
# 10% of the time, keep original
if rng.random() < 0.5:
masked_token = tokens[index]
# 10% of the time, replace with random word
else:
masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]
output_tokens[index] = masked_token
masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
masked_lms = sorted(masked_lms, key=lambda x: x.index)
masked_lm_positions = []
masked_lm_labels = []
for p in masked_lms:
masked_lm_positions.append(p.index)
masked_lm_labels.append(p.label)
return (output_tokens, masked_lm_positions, masked_lm_labels)
def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
"""Truncates a pair of sequences to a maximum sequence length."""
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_num_tokens:
break
trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
assert len(trunc_tokens) >= 1
# We want to sometimes truncate from the front and sometimes from the
# back to add more randomness and avoid biases.
if rng.random() < 0.5:
del trunc_tokens[0]
else:
trunc_tokens.pop()
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--input_file",
default=None,
type=str,
required=True,
help=
"The input train corpus. can be directory with .txt files or a path to a single file"
)
parser.add_argument(
"--output_file",
default=None,
type=str,
required=True,
help="The output file where created hdf5 formatted data will be written.")
parser.add_argument("--vocab_file",
default=None,
type=str,
required=False,
help="The vocabulary the BERT model will train on. "
"Use bert_model argument would ignore this. "
"The bert_model argument is recommended.")
parser.add_argument(
"--do_lower_case",
action='store_true',
default=True,
help=
"Whether to lower case the input text. True for uncased models, False for cased models. "
"Use bert_model argument would ignore this. The bert_model argument is recommended."
)
parser.add_argument(
"--bert_model",
default="bert-base-uncased",
type=str,
required=False,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
"If provided, use the pre-trained model used tokenizer to create data "
"and ignore vocab_file and do_lower_case.")
## Other parameters
#int
parser.add_argument(
"--max_seq_length",
default=128,
type=int,
help=
"The maximum total input sequence length after WordPiece tokenization. \n"
"Sequences longer than this will be truncated, and sequences shorter \n"
"than this will be padded.")
parser.add_argument(
"--dupe_factor",
default=10,
type=int,
help=
"Number of times to duplicate the input data (with different masks).")
parser.add_argument(
"--max_predictions_per_seq",
default=20,
type=int,
help="Maximum number of masked LM predictions per sequence.")
# floats
parser.add_argument("--masked_lm_prob",
default=0.15,
type=float,
help="Masked LM probability.")
parser.add_argument(
"--short_seq_prob",
default=0.1,
type=float,
help=
"Probability to create a sequence shorter than maximum sequence length")
parser.add_argument('--random_seed',
type=int,
default=12345,
help="random seed for initialization")
args = parser.parse_args()
print(args)
if args.bert_model:
tokenizer = BertTokenizer.from_pretrained(args.bert_model)
else:
assert args.vocab_file, (
"vocab_file must be set If bert_model is not provided.")
tokenizer = BertTokenizer(args.vocab_file,
do_lower_case=args.do_lower_case)
input_files = []
if os.path.isfile(args.input_file):
input_files.append(args.input_file)
elif os.path.isdir(args.input_file):
input_files = [
os.path.join(args.input_file, f)
for f in os.listdir(args.input_file)
if (os.path.isfile(os.path.join(args.input_file, f))
and f.endswith('.txt'))
]
else:
raise ValueError("{} is not a valid path".format(args.input_file))
rng = random.Random(args.random_seed)
instances = create_training_instances(input_files, tokenizer,
args.max_seq_length, args.dupe_factor,
args.short_seq_prob,
args.masked_lm_prob,
args.max_predictions_per_seq, rng)
output_file = args.output_file
write_instance_to_example_file(instances, tokenizer, args.max_seq_length,
args.max_predictions_per_seq, output_file)
if __name__ == "__main__":
main()
\ No newline at end of file
Zulfiqar A. Bhutta trained as a physician in Pakistan in the early stages of his career.
He holds titles across various organizations in diverse geographies.
Professor Bhutta is the Founding Director of the Center of Excellence in Women and Child Health & Institute for Global Child Health & Development, at the Aga Khan University South-Central Asia, East Africa & United Kingdom.
He is currently the Co-Director at the Centre for Global Child Health, at the Hospital for Sick Children and leads many projects as a Senior Scientist at the Research Institute in the Centre for Global Child Health at Sick Kids.
He holds a Professorship at the University of Toronto in the Department of Nutritional Sciences and the Division of Epidemiology, Dalla Lana School of Public Health.
Additionally, he holds concurrent professorship at the Department of Paediatrics, Aga Khan University in Karachi, Pakistan and at the Schools of Public Health of Johns Hopkins University, Tufts University, Boston University, University of Alberta and the London School of Hygiene & Tropical Medicine.
He is a designated Distinguished National Professor of the Government of Pakistan and was the Founding Chair of the National Research Ethics Committee of the Government of Pakistan from 2003-2014.
Dr. Bhutta received his MBBS from Khyber Medical College in Peshawar, Pakistan in 1977 at which time he was named "Best Graduate of the Year" and awarded the University Gold Medal for overall distinction.
His PhD work was completed at Karolinska Institute in Stockholm, Sweden in 1996.
He is a Fellow of the Royal College of Physicians (Edinburgh & London), the Royal College of Paediatrics and Child Health (London), American Academy of Paediatrics and the Pakistan Academy of Sciences.
Following the completion of his PhD Dr. Bhutta began working as House Surgeon in Obstetrics & Gynecology at the Khyber Teaching Hospital, Peshawar (April-November 1978).
He began work in paediatrics as a physician in November of 1978 in the Professorial Unit at the Institute of Child Health, Jinnah Postgraduate Medical Centre, Karachi (Pakistan).
Through 1980's he continued his work as a surgeon and paediatrician.
He undertook his first professor position in the Department of Paediatrics, The Aga Khan University Hospital, Karachi (Pakistan), from November 1987 to June 1992.
In 2005, Dr. Bhutta became the Chairman of the Department of Paediatrics & Child Health at the Aga Khan University & Medical Center, a position held until 2008.
Following his term as Chairman he became The Noordin Noormahomed Sheriff Professor & Founding Chair, Division of Women & Child Health, The Aga Khan University, a position he held for four years.
Dr. Bhutta currently holds the titles of co-director of the Centre for Global Child Health at the Hospital for Sick Children in Toronto, and founding director of the Centre of Excellence in Women and Child Health at the Aga Khan University.
In 2020, he was appointed founding director of the Institute for Global child Health & Development at the Aga Khan University and elected Fellow to the Royal Society, United Kingdom.
Outside of his professional responsibilities Dr. Bhutta serves on various local and international boards and committees, including a series of editorial boards.
In his various capacities Dr. Bhutta has produced a large collection of publications working with his teams at Sick Kids, AKU and international partners.
These include book reviews, chapters, 1.
"Haematological disorders" "Neonatal Jaundice" in Neonatal Vade‑Mecum, Fleming PJ, Speidel BD, Dunn PM Eds, Lloyd‑Luke Publishers, UK, 1986.
Revised 2nd Edition 1991.
2.
"Nutritional management of acute and persistent diarrhoea".
A M Molla, Bhutta Z A and  A Molla.
In McNeish A S, Mittal S K and Walker-Smith J A (eds).
Recent trends in diarrhoea and malnutrition, MAMC, Delhi, 1991, pp 37-51.
3.
"Paediatric Prescribing” in "Text book of Paediatrics for developing countries"            Arif MA, Hanif SM, Wasti SMK Eds, 1989, 2nd Edition 1996,  PPA, Karachi.
& Lahore 4.
"Innovations in neonatal care : Impact on neonatal survival in the developing world:.
Bhutta Z A  Zaidi S (Editor) 1992.
TWEL Publisher.
Karachi pp 121-131 5.
"Short course therapy in Pediatrics" Bhutta Z A& Teele D.  In Tice A D, Waldvogel F (Eds), Contemporary issues in Infectious Disease Epidemiology and Management, 1993 Gardiner Caldwell, Cheshire, pp 52 - 60.
6.
"Dietary management of persistent diarrhoea".
Bhutta Z A, Molla A M, Issani Z.
In Reflections on  Diarrhoeal Disease & Nutrition  of Children".
1993 Karachi, pp 97 - 103.
7.
"Prescribing practices amongst general practitioners (GPs) and consultant paediatricians in childhood diarrhoea.”  S.Q.
Nizami, I.A.
Khan, Bhutta Z A.
In "Reflections on Diarrhoeal Disease and Nutrition of Children".
1993 Karachi, pp  88-90.
8.
"The challenge of multidrug-resistant typhoid".
Bhutta Z A.
In Puri R K, Sachdev H P S, Choudhry P, Verma I C (Eds), Current concepts in Paediatrics, 1994.
Jaypee Publishers, New Delhi, pp 403.8.
9.
"Perinatal Care in Pakistan: Current status and trends".
In Proceedings of the Workshop in Reproductive Health.
College of Physicians and Surgeons, Pakistan, Karachi, 1995, pp 95-103.
10.
“A study of whole body protein kinetics in malnourished children with persistent diarrhoea” Bhutta Z A, Nizami SQ, Isani Z, Hardy S, Hendricks K, Young V.   Report of the second RCM coordinated Research Programme for application of stable isotope tracer methods to studies of energy metabolism in malnourished populations of developing countries.
NAHRES-30 1996 IAEA Vienna.
11.
"Pneumococcal infections in Pakistan: a country report".
In Adult Immunization in Asia, Fondation Mercel Merieux, Lyon, 1998. pp 79-82.
12.
“Factors affecting protein and aminoacid metabolism in childhood from developing countries".
In Child Nutrition: an international perspective.
Editors Solomons NW, Caballero B, Brown KH.
CRC Press 1998.
13.
"Protein Digestion and Bioavailability".
In Encyclopedia of Human Nutrition.
Editors: Sadler M, Strain JJ, Caballero B.
Academic Press (London), 1998 pp.1646-54.
14.
"Perinatal Care in Pakistan.
Reproductive Health: A manual for family practice and primary health care.
Bhutta Z A, Maqbool S.  College of Physicians and Surgeons, Pakistan, Karachi, 1999, pp 69-78.
15.
“Effective interventions to reduce neonatal mortality and morbidity from perinatal infection.
Bhutta ZA.
In Costello A, Manandhar D (eds).
"Improving Newborn Infant Health in Developing Countries’ 1999.
Imperial College Press, London pp.289-308.
16.
“Ambulatory management of typhoid fever”            “Risk factors and management of micronutrient deficiencies”            “Management of persistent diarrhoea in developing countries”.
In Manual of International Child Health, British Medical Journal, 2000 (in press).
17.
“The role of Cefixime in typhoid fever during childhood” in Cefixime, Adam D, Quintiliani R (Eds), Torre-Lazur-McCann, Tokyo, 2000; pp.107-112.
18.
"Micronutrients and Child Health in the Commonwealth”, Commonwealth Foundation" (UK) (2001).
19.
"Isotopic evaluation of breast milk intake, energy metabolism growth and body composition of exclusively breastfed infants in Pakistan".
Bhutta ZA, Nizami SQ, Weaver LT, Preston T. In Application of Stable Isotopes to evaluate Growth and Body Composition of Exclusively Breastfed infants, IAEA and WHO, NAHRES Report.
2000.
20.
“Typhoid Fever in Childhood: the south Asian experience”.
Ahmad K &Bhutta ZA.
In "Recent Advances in Paediatrics", Gupte S (Ed), 2000, India .
21.
“Neonatal Infections in developing countries” in  Carrera JM, Cabero L, Baraibar R (Eds).
The Perinatal Medicine of the new Millennium.
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import os
import sys
import random
import time
import math
from functools import partial
import numpy as np
import paddle
from paddle.io import DataLoader
from paddle.metric import Metric, Accuracy, Precision, Recall
from paddlenlp.datasets import GlueCoLA, GlueSST2, GlueMRPC, GlueSTSB, GlueQQP, GlueMNLI, GlueQNLI, GlueRTE
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer
from paddlenlp.transformers import ElectraForSequenceClassification, ElectraTokenizer
from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer
from paddlenlp.metrics import AccuracyAndF1, Mcc, PearsonAndSpearman
FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)
TASK_CLASSES = {
"cola": (GlueCoLA, Mcc),
"sst-2": (GlueSST2, Accuracy),
"mrpc": (GlueMRPC, AccuracyAndF1),
"sts-b": (GlueSTSB, PearsonAndSpearman),
"qqp": (GlueQQP, AccuracyAndF1),
"mnli": (GlueMNLI, Accuracy),
"qnli": (GlueQNLI, Accuracy),
"rte": (GlueRTE, Accuracy),
}
MODEL_CLASSES = {"bert": (BertForSequenceClassification, BertTokenizer)}
def parse_args():
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument(
"--task_name",
default=None,
type=str,
required=True,
help="The name of the task to train selected in the list: " +
", ".join(TASK_CLASSES.keys()), )
parser.add_argument(
"--model_type",
default=None,
type=str,
required=True,
help="Model type selected in the list: " +
", ".join(MODEL_CLASSES.keys()), )
parser.add_argument(
"--model_name_or_path",
default=None,
type=str,
required=True,
help="Path to pre-trained model or shortcut name selected in the list: "
+ ", ".join(
sum([
list(classes[-1].pretrained_init_configuration.keys())
for classes in MODEL_CLASSES.values()
], [])), )
parser.add_argument(
"--output_dir",
default=None,
type=str,
required=True,
help="The output directory where the model predictions and checkpoints will be written.",
)
parser.add_argument(
"--max_seq_length",
default=128,
type=int,
help="The maximum total input sequence length after tokenization. Sequences longer "
"than this will be truncated, sequences shorter will be padded.", )
parser.add_argument(
"--learning_rate",
default=1e-4,
type=float,
help="The initial learning rate for Adam.")
parser.add_argument(
"--num_train_epochs",
default=3,
type=int,
help="Total number of training epochs to perform.", )
parser.add_argument(
"--logging_steps",
type=int,
default=100,
help="Log every X updates steps.")
parser.add_argument(
"--save_steps",
type=int,
default=100,
help="Save checkpoint every X updates steps.")
parser.add_argument(
"--batch_size",
default=32,
type=int,
help="Batch size per GPU/CPU for training.", )
parser.add_argument(
"--weight_decay",
default=0.0,
type=float,
help="Weight decay if we apply some.")
parser.add_argument(
"--warmup_steps",
default=0,
type=int,
help="Linear warmup over warmup_steps. If > 0: Override warmup_proportion"
)
parser.add_argument(
"--warmup_proportion",
default=0.,
type=float,
help="Linear warmup proportion over total steps.")
parser.add_argument(
"--adam_epsilon",
default=1e-6,
type=float,
help="Epsilon for Adam optimizer.")
parser.add_argument(
"--max_steps",
default=-1,
type=int,
help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
)
parser.add_argument(
"--seed", default=42, type=int, help="random seed for initialization")
parser.add_argument(
"--n_gpu",
default=1,
type=int,
help="number of gpus to use, 0 for cpu.")
args = parser.parse_args()
return args
def set_seed(args):
random.seed(args.seed + paddle.distributed.get_rank())
np.random.seed(args.seed + paddle.distributed.get_rank())
paddle.seed(args.seed + paddle.distributed.get_rank())
def evaluate(model, loss_fct, metric, data_loader):
model.eval()
metric.reset()
for batch in data_loader:
input_ids, segment_ids, labels = batch
logits = model(input_ids, segment_ids)
loss = loss_fct(logits, labels)
correct = metric.compute(logits, labels)
metric.update(correct)
res = metric.accumulate()
if isinstance(metric, AccuracyAndF1):
logger.info(
"eval loss: %f, acc: %s, precision: %s, recall: %s, f1: %s, acc and f1: %s."
% (loss.numpy(), res[0], res[1], res[2], res[3], res[4]))
elif isinstance(metric, Mcc):
logger.info("eval loss: %f, mcc: %s." % (loss.numpy(), res[0]))
elif isinstance(metric, PearsonAndSpearman):
logger.info(
"eval loss: %f, pearson: %s, spearman: %s, pearson and spearman: %s."
% (loss.numpy(), res[0], res[1], res[2]))
else:
logger.info("eval loss: %f, acc: %s." % (loss.numpy(), res))
model.train()
def convert_example(example,
tokenizer,
label_list,
max_seq_length=512,
is_test=False):
"""convert a glue example into necessary features"""
def _truncate_seqs(seqs, max_seq_length):
if len(seqs) == 1: # single sentence
# Account for [CLS] and [SEP] with "- 2"
seqs[0] = seqs[0][0:(max_seq_length - 2)]
else: # Sentence pair
# Account for [CLS], [SEP], [SEP] with "- 3"
tokens_a, tokens_b = seqs
max_seq_length -= 3
while True: # Truncate with longest_first strategy
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_seq_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
return seqs
def _concat_seqs(seqs, separators, seq_mask=0, separator_mask=1):
concat = sum((seq + sep for sep, seq in zip(separators, seqs)), [])
segment_ids = sum(
([i] * (len(seq) + len(sep))
for i, (sep, seq) in enumerate(zip(separators, seqs))), [])
if isinstance(seq_mask, int):
seq_mask = [[seq_mask] * len(seq) for seq in seqs]
if isinstance(separator_mask, int):
separator_mask = [[separator_mask] * len(sep) for sep in separators]
p_mask = sum((s_mask + mask
for sep, seq, s_mask, mask in zip(
separators, seqs, seq_mask, separator_mask)), [])
return concat, segment_ids, p_mask
if not is_test:
# `label_list == None` is for regression task
label_dtype = "int64" if label_list else "float32"
# Get the label
label = example[-1]
example = example[:-1]
# Create label maps if classification task
if label_list:
label_map = {}
for (i, l) in enumerate(label_list):
label_map[l] = i
label = label_map[label]
label = np.array([label], dtype=label_dtype)
# Tokenize raw text
tokens_raw = [tokenizer(l) for l in example]
    # Truncate to max_seq_length
tokens_trun = _truncate_seqs(tokens_raw, max_seq_length)
    # Concatenate the sequences with special tokens
tokens_trun[0] = [tokenizer.cls_token] + tokens_trun[0]
tokens, segment_ids, _ = _concat_seqs(tokens_trun, [[tokenizer.sep_token]] *
len(tokens_trun))
# Convert the token to ids
input_ids = tokenizer.convert_tokens_to_ids(tokens)
valid_length = len(input_ids)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
# input_mask = [1] * len(input_ids)
if not is_test:
return input_ids, segment_ids, valid_length, label
else:
return input_ids, segment_ids, valid_length
def do_train(args):
paddle.set_device("gpu" if args.n_gpu else "cpu")
if paddle.distributed.get_world_size() > 1:
paddle.distributed.init_parallel_env()
set_seed(args)
args.task_name = args.task_name.lower()
dataset_class, metric_class = TASK_CLASSES[args.task_name]
args.model_type = args.model_type.lower()
model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
train_dataset = dataset_class.get_datasets(["train"])
tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
trans_func = partial(
convert_example,
tokenizer=tokenizer,
label_list=train_dataset.get_labels(),
max_seq_length=args.max_seq_length)
train_dataset = train_dataset.apply(trans_func, lazy=True)
train_batch_sampler = paddle.io.DistributedBatchSampler(
train_dataset, batch_size=args.batch_size, shuffle=True)
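    # Collate samples into a batch; the third field produced by convert_example
    # (the valid sequence length) is dropped here (i != 2) since the model does not use it.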
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id), # input
Pad(axis=0, pad_val=tokenizer.pad_token_id), # segment
Stack(), # length
Stack(dtype="int64" if train_dataset.get_labels() else "float32") # label
): [data for i, data in enumerate(fn(samples)) if i != 2]
train_data_loader = DataLoader(
dataset=train_dataset,
batch_sampler=train_batch_sampler,
collate_fn=batchify_fn,
num_workers=0,
return_list=True)
if args.task_name == "mnli":
dev_dataset_matched, dev_dataset_mismatched = dataset_class.get_datasets(
["dev_matched", "dev_mismatched"])
dev_dataset_matched = dev_dataset_matched.apply(trans_func, lazy=True)
dev_dataset_mismatched = dev_dataset_mismatched.apply(
trans_func, lazy=True)
dev_batch_sampler_matched = paddle.io.BatchSampler(
dev_dataset_matched, batch_size=args.batch_size, shuffle=False)
dev_data_loader_matched = DataLoader(
dataset=dev_dataset_matched,
batch_sampler=dev_batch_sampler_matched,
collate_fn=batchify_fn,
num_workers=0,
return_list=True)
dev_batch_sampler_mismatched = paddle.io.BatchSampler(
dev_dataset_mismatched, batch_size=args.batch_size, shuffle=False)
dev_data_loader_mismatched = DataLoader(
dataset=dev_dataset_mismatched,
batch_sampler=dev_batch_sampler_mismatched,
collate_fn=batchify_fn,
num_workers=0,
return_list=True)
else:
dev_dataset = dataset_class.get_datasets(["dev"])
dev_dataset = dev_dataset.apply(trans_func, lazy=True)
dev_batch_sampler = paddle.io.BatchSampler(
dev_dataset, batch_size=args.batch_size, shuffle=False)
dev_data_loader = DataLoader(
dataset=dev_dataset,
batch_sampler=dev_batch_sampler,
collate_fn=batchify_fn,
num_workers=0,
return_list=True)
    num_classes = 1 if train_dataset.get_labels() is None else len(
train_dataset.get_labels())
model = model_class.from_pretrained(
args.model_name_or_path, num_classes=num_classes)
if paddle.distributed.get_world_size() > 1:
model = paddle.DataParallel(model)
num_training_steps = args.max_steps if args.max_steps > 0 else (
len(train_data_loader) * args.num_train_epochs)
warmup_steps = args.warmup_steps if args.warmup_steps > 0 else (
int(math.floor(num_training_steps * args.warmup_proportion)))
lr_scheduler = paddle.optimizer.lr.LambdaDecay(
args.learning_rate,
lambda current_step, num_warmup_steps=warmup_steps,
num_training_steps=num_training_steps : float(
current_step) / float(max(1, num_warmup_steps))
if current_step < num_warmup_steps else max(
0.0,
float(num_training_steps - current_step) / float(
max(1, num_training_steps - num_warmup_steps))))
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
beta1=0.9,
beta2=0.999,
epsilon=args.adam_epsilon,
parameters=model.parameters(),
weight_decay=args.weight_decay,
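        # Apply weight decay only to parameters whose names contain neither
        # "bias" nor "norm" (i.e. skip biases and LayerNorm weights).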
apply_decay_param_fun=lambda x: x in [
p.name for n, p in model.named_parameters()
if not any(nd in n for nd in ["bias", "norm"])
])
loss_fct = paddle.nn.loss.CrossEntropyLoss() if train_dataset.get_labels(
) else paddle.nn.loss.MSELoss()
metric = metric_class()
global_step = 0
tic_train = time.time()
for epoch in range(args.num_train_epochs):
for step, batch in enumerate(train_data_loader):
global_step += 1
input_ids, segment_ids, labels = batch
logits = model(input_ids, segment_ids)
loss = loss_fct(logits, labels)
loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.clear_gradients()
if global_step % args.logging_steps == 0:
logger.info(
"global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
% (global_step, num_training_steps, epoch, step,
paddle.distributed.get_rank(), loss, optimizer.get_lr(),
args.logging_steps / (time.time() - tic_train)))
tic_train = time.time()
if global_step % args.save_steps == 0:
tic_eval = time.time()
if args.task_name == "mnli":
evaluate(model, loss_fct, metric, dev_data_loader_matched)
evaluate(model, loss_fct, metric,
dev_data_loader_mismatched)
logger.info("eval done total : %s s" %
(time.time() - tic_eval))
else:
evaluate(model, loss_fct, metric, dev_data_loader)
logger.info("eval done total : %s s" %
(time.time() - tic_eval))
if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
output_dir = os.path.join(
args.output_dir, "%s_ft_model_%d.pdparams" %
(args.task_name, global_step))
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# Need better way to get inner model of DataParallel
model_to_save = model._layers if isinstance(
model, paddle.DataParallel) else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
def print_arguments(args):
"""print arguments"""
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).items()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == "__main__":
args = parse_args()
print_arguments(args)
if args.n_gpu > 1:
paddle.distributed.spawn(do_train, args=(args, ), nprocs=args.n_gpu)
else:
do_train(args)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import collections
import itertools
import logging
import os
import random
import time
import h5py
from functools import partial
from concurrent.futures import ThreadPoolExecutor
import numpy as np
import paddle
import paddle.distributed as dist
from paddle.io import DataLoader, Dataset
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.transformers import BertForPretraining, BertModel, BertPretrainingCriterion
from paddlenlp.transformers import BertTokenizer
FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)
MODEL_CLASSES = {
"bert": (BertForPretraining, BertTokenizer),
}
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_type",
default=None,
type=str,
required=True,
help="Model type selected in the list: " +
", ".join(MODEL_CLASSES.keys()),
)
parser.add_argument(
"--model_name_or_path",
default=None,
type=str,
required=True,
help="Path to pre-trained model or shortcut name selected in the list: "
+ ", ".join(
sum([
list(classes[-1].pretrained_init_configuration.keys())
for classes in MODEL_CLASSES.values()
], [])),
)
parser.add_argument(
"--input_dir",
default=None,
type=str,
required=True,
help="The input directory where the data will be read from.",
)
parser.add_argument(
"--output_dir",
default=None,
type=str,
required=True,
help=
"The output directory where the model predictions and checkpoints will be written.",
)
parser.add_argument(
"--max_predictions_per_seq",
default=80,
type=int,
help="The maximum total of masked tokens in input sequence")
parser.add_argument(
"--batch_size",
default=8,
type=int,
help="Batch size per GPU/CPU for training.",
)
parser.add_argument("--learning_rate",
default=5e-5,
type=float,
help="The initial learning rate for Adam.")
parser.add_argument("--weight_decay",
default=0.0,
type=float,
help="Weight decay if we apply some.")
parser.add_argument("--adam_epsilon",
default=1e-8,
type=float,
help="Epsilon for Adam optimizer.")
parser.add_argument("--max_grad_norm",
default=1.0,
type=float,
help="Max gradient norm.")
parser.add_argument(
"--num_train_epochs",
default=3,
type=int,
help="Total number of training epochs to perform.",
)
parser.add_argument(
"--max_steps",
default=-1,
type=int,
help=
"If > 0: set total number of training steps to perform. Override num_train_epochs.",
)
parser.add_argument("--warmup_steps",
default=0,
type=int,
help="Linear warmup over warmup_steps.")
parser.add_argument("--logging_steps",
type=int,
default=500,
help="Log every X updates steps.")
parser.add_argument("--save_steps",
type=int,
default=500,
help="Save checkpoint every X updates steps.")
parser.add_argument("--seed",
type=int,
default=42,
help="random seed for initialization")
parser.add_argument("--n_gpu",
type=int,
default=1,
help="number of gpus to use, 0 for cpu.")
args = parser.parse_args()
return args
def set_seed(args):
random.seed(args.seed + paddle.distributed.get_rank())
np.random.seed(args.seed + paddle.distributed.get_rank())
paddle.seed(args.seed + paddle.distributed.get_rank())
class WorkerInitObj(object):
def __init__(self, seed):
self.seed = seed
def __call__(self, id):
np.random.seed(seed=self.seed + id)
random.seed(self.seed + id)
def create_pretraining_dataset(input_file, max_pred_length, shared_list, args,
worker_init):
train_data = PretrainingDataset(input_file=input_file,
max_pred_length=max_pred_length)
# files have been sharded, no need to dispatch again
train_batch_sampler = paddle.io.BatchSampler(train_data,
batch_size=args.batch_size,
shuffle=True)
    # DataLoader cannot be pickled because it holds a device place.
    # If it could be pickled, a global function (instead of a lambda) and a
    # ProcessPoolExecutor (instead of ThreadPoolExecutor) could be used to prefetch.
def _collate_data(data, stack_fn=Stack()):
num_fields = len(data[0])
out = [None] * num_fields
# input_ids, segment_ids, input_mask, masked_lm_positions,
# masked_lm_labels, next_sentence_labels, mask_token_num
for i in (0, 1, 2, 5):
out[i] = stack_fn([x[i] for x in data])
batch_size, seq_length = out[0].shape
size = num_mask = sum(len(x[3]) for x in data)
# Padding for divisibility by 8 for fp16 or int8 usage
if size % 8 != 0:
size += 8 - (size % 8)
# masked_lm_positions
# Organize as a 1D tensor for gather or use gather_nd
out[3] = np.full(size, 0, dtype=np.int64)
# masked_lm_labels
out[4] = np.full([size, 1], -1, dtype=np.int64)
mask_token_num = 0
for i, x in enumerate(data):
for j, pos in enumerate(x[3]):
out[3][mask_token_num] = i * seq_length + pos
out[4][mask_token_num] = x[4][j]
mask_token_num += 1
# mask_token_num
out.append(np.asarray([mask_token_num], dtype=np.float32))
return out
train_data_loader = DataLoader(dataset=train_data,
batch_sampler=train_batch_sampler,
collate_fn=_collate_data,
num_workers=0,
worker_init_fn=worker_init,
return_list=True)
return train_data_loader, input_file
class PretrainingDataset(Dataset):
def __init__(self, input_file, max_pred_length):
self.input_file = input_file
self.max_pred_length = max_pred_length
f = h5py.File(input_file, "r")
keys = [
'input_ids', 'input_mask', 'segment_ids', 'masked_lm_positions',
'masked_lm_ids', 'next_sentence_labels'
]
self.inputs = [np.asarray(f[key][:]) for key in keys]
f.close()
def __len__(self):
'Denotes the total number of samples'
return len(self.inputs[0])
def __getitem__(self, index):
[
input_ids, input_mask, segment_ids, masked_lm_positions,
masked_lm_ids, next_sentence_labels
] = [
input[index].astype(np.int64)
if indice < 5 else np.asarray(input[index].astype(np.int64))
for indice, input in enumerate(self.inputs)
]
# TODO: whether to use reversed mask by changing 1s and 0s to be
# consistent with nv bert
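        # Convert the 0/1 padding mask into an additive attention bias of shape
        # [1, 1, seq_len]: 0 for real tokens, -1e9 for padding positions.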
input_mask = (1 - np.reshape(input_mask.astype(np.float32),
[1, 1, input_mask.shape[0]])) * -1e9
index = self.max_pred_length
        # Store the number of masked tokens in `index`.
        # Note: numpy.nonzero returns a tuple of index arrays, unlike torch.nonzero.
padded_mask_indices = (masked_lm_positions == 0).nonzero()[0]
if len(padded_mask_indices) != 0:
index = padded_mask_indices[0].item()
mask_token_num = index
else:
index = 0
mask_token_num = 0
# masked_lm_labels = np.full(input_ids.shape, -1, dtype=np.int64)
# masked_lm_labels[masked_lm_positions[:index]] = masked_lm_ids[:index]
masked_lm_labels = masked_lm_ids[:index]
masked_lm_positions = masked_lm_positions[:index]
# softmax_with_cross_entropy enforce last dim size equal 1
masked_lm_labels = np.expand_dims(masked_lm_labels, axis=-1)
next_sentence_labels = np.expand_dims(next_sentence_labels, axis=-1)
return [
input_ids, segment_ids, input_mask, masked_lm_positions,
masked_lm_labels, next_sentence_labels
]
def do_train(args):
paddle.set_device("gpu" if args.n_gpu else "cpu")
if paddle.distributed.get_world_size() > 1:
paddle.distributed.init_parallel_env()
set_seed(args)
worker_init = WorkerInitObj(args.seed + paddle.distributed.get_rank())
args.model_type = args.model_type.lower()
model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
model = BertForPretraining(
BertModel(**model_class.pretrained_init_configuration[
args.model_name_or_path]))
criterion = BertPretrainingCriterion(
getattr(model,
BertForPretraining.base_model_prefix).config["vocab_size"])
if paddle.distributed.get_world_size() > 1:
model = paddle.DataParallel(model)
    # If the default last_epoch were used, the lr of the first iteration would be 0.
# Use `last_epoch = 0` to be consistent with nv bert.
lr_scheduler = paddle.optimizer.lr.LambdaDecay(
args.learning_rate,
lambda current_step, num_warmup_steps=args.warmup_steps,
num_training_steps=args.max_steps if args.max_steps > 0 else
(len(train_data_loader) * args.num_train_epochs): float(
current_step) / float(max(1, num_warmup_steps))
if current_step < num_warmup_steps else max(
0.0,
float(num_training_steps - current_step) / float(
max(1, num_training_steps - num_warmup_steps))),
last_epoch=0)
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
epsilon=args.adam_epsilon,
parameters=model.parameters(),
weight_decay=args.weight_decay,
apply_decay_param_fun=lambda x: x in [
p.name for n, p in model.named_parameters()
if not any(nd in n for nd in ["bias", "norm"])
])
pool = ThreadPoolExecutor(1)
global_step = 0
tic_train = time.time()
for epoch in range(args.num_train_epochs):
files = [
os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
if os.path.isfile(os.path.join(args.input_dir, f))
and "training" in f
]
files.sort()
num_files = len(files)
random.Random(args.seed + epoch).shuffle(files)
f_start_id = 0
shared_file_list = {}
if paddle.distributed.get_world_size() > num_files:
remainder = paddle.distributed.get_world_size() % num_files
data_file = files[
(f_start_id * paddle.distributed.get_world_size() +
paddle.distributed.get_rank() + remainder * f_start_id) %
num_files]
else:
data_file = files[(f_start_id * paddle.distributed.get_world_size()
+ paddle.distributed.get_rank()) % num_files]
previous_file = data_file
train_data_loader, _ = create_pretraining_dataset(
data_file, args.max_predictions_per_seq, shared_file_list, args,
worker_init)
# TODO(guosheng): better way to process single file
single_file = True if f_start_id + 1 == len(files) else False
for f_id in range(f_start_id, len(files)):
if not single_file and f_id == f_start_id:
continue
if paddle.distributed.get_world_size() > num_files:
data_file = files[(f_id * paddle.distributed.get_world_size() +
paddle.distributed.get_rank() +
remainder * f_id) % num_files]
else:
data_file = files[(f_id * paddle.distributed.get_world_size() +
paddle.distributed.get_rank()) % num_files]
previous_file = data_file
dataset_future = pool.submit(create_pretraining_dataset, data_file,
args.max_predictions_per_seq,
shared_file_list, args, worker_init)
for step, batch in enumerate(train_data_loader):
global_step += 1
(input_ids, segment_ids, input_mask, masked_lm_positions,
masked_lm_labels, next_sentence_labels,
masked_lm_scale) = batch
prediction_scores, seq_relationship_score = model(
input_ids=input_ids,
token_type_ids=segment_ids,
attention_mask=input_mask,
masked_positions=masked_lm_positions)
loss = criterion(prediction_scores, seq_relationship_score,
masked_lm_labels, next_sentence_labels,
masked_lm_scale)
if global_step % args.logging_steps == 0:
if (not args.n_gpu > 1
) or paddle.distributed.get_rank() == 0:
logger.info(
"global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
% (global_step, epoch, step, loss,
args.logging_steps / (time.time() - tic_train)))
tic_train = time.time()
loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.clear_gradients()
if global_step % args.save_steps == 0:
if (not args.n_gpu > 1
) or paddle.distributed.get_rank() == 0:
output_dir = os.path.join(args.output_dir,
"model_%d" % global_step)
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# need better way to get inner model of DataParallel
model_to_save = model._layers if isinstance(
model, paddle.DataParallel) else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
paddle.save(
optimizer.state_dict(),
os.path.join(output_dir, "model_state.pdopt"))
if global_step >= args.max_steps:
del train_data_loader
return
del train_data_loader
train_data_loader, data_file = dataset_future.result(timeout=None)
if __name__ == "__main__":
args = parse_args()
if args.n_gpu > 1:
paddle.distributed.spawn(do_train, args=(args, ), nprocs=args.n_gpu)
else:
do_train(args)
......@@ -18,15 +18,17 @@ ELMo (Embeddings from Language Models) is one of the important general-purpose semantic representation models
* PaddlePaddle installation
This project requires PaddlePaddle 2.0 or later. Please follow the [installation guide](http://www.paddlepaddle.org/#quick-start) to install it.
This project requires PaddlePaddle 2.0rc1 or later. Please follow the [installation guide](http://www.paddlepaddle.org/#quick-start) to install it.
* Environment dependencies
Python 3.6+ is required, and sklearn and gensim must be installed. For other environment requirements, please refer to the PaddlePaddle [installation notes](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/install/index_cn.html).
Python 3.6+ is required.
```shell
pip install sklearn gensim
```
This project depends on sklearn and gensim; please install them before running it.
```shell
pip install sklearn gensim
```
### Code structure
......
......@@ -5,8 +5,6 @@
## 1. Task description
This example implements an LSTM-based language model. Given an input word sequence (word-segmented for Chinese, tokenized for English), it computes the sequence's ppl (language-model perplexity, which reflects how fluent a sentence is). For an introduction to RNN-based language models, see [this paper](https://arxiv.org/abs/1409.2329). Compared with traditional methods, RNN-based methods handle sparse (rare) words better.
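As a rough illustration (not part of this project's code), perplexity is simply the exponential of the average per-token cross-entropy, so a minimal numpy sketch of the metric is:

```python
import numpy as np

def perplexity(token_nll):
    """Perplexity from per-token negative log-likelihoods (natural log)."""
    # ppl = exp(mean cross-entropy); lower means the model finds the text more fluent.
    return float(np.exp(np.mean(token_nll)))

# Example: an average NLL of 4.6 nats/token gives ppl ~ 99.5.
print(perplexity([4.2, 4.8, 4.9, 4.5]))
```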
**The language model currently requires PaddlePaddle 2.0 or later, or an appropriate develop version.**
## 2. Results
......@@ -27,6 +25,22 @@
## 1. Running the model for the first time
### Installation
* PaddlePaddle installation
This project requires PaddlePaddle 2.0-rc1 or later. Please follow the [installation guide](http://www.paddlepaddle.org/#quick-start) to install it.
* PaddleNLP installation
```shell
pip install paddlenlp>=2.0.0b
```
* Environment dependencies
Python 3.6+ is required.
### Data preparation
To make testing easier for developers, a built-in data download script automatically downloads the PTB dataset by default.
......
......@@ -89,4 +89,3 @@ class UpdateModel(paddle.callbacks.Callback):
    # This callback resets the model hidden states and updates the learning rate before each epoch begins
def on_epoch_begin(self, epoch=None, logs=None):
self.model.network.reset_states()
# Language Model
## Transformer-XL
The brief directory structure and contents of this example are as follows:
```text
.
├── eval.py # prediction script
├── reader.py # data reading interface
├── README.md # documentation
├── train.py # training script
└── configs # configuration files
```
## Model overview
This project is a PaddlePaddle implementation of the Transformer-XL language model, covering model training, prediction, and so on.
## Quick start
### Installation
1. Install PaddlePaddle
This project requires PaddlePaddle 2.0rc or later (or an appropriate develop version). Please follow the [installation guide](https://www.paddlepaddle.org.cn/install/quick) to install it.
2. Download the code
Clone the repository to your local machine.
3. Environment dependencies
This model uses PaddlePaddle. For environment dependencies, please first refer to the environment-dependency section of the PaddlePaddle [installation notes](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/index_cn.html).
In addition, the following packages also need to be installed:
* attrdict
* pyyaml
### Data preparation
The public datasets enwik8, text8, and wt103 are commonly used for language-model benchmarking. The data can be obtained and processed as follows:
```shell
bash gen_data.sh
```
The required data will be generated under ./gen_data/ in the current directory.
### Single-machine training
### Single GPU
Taking the provided enwik8 data as an example, run the following command to train the model:
```sh
# setting visible devices for training
export CUDA_VISIBLE_DEVICES=0
python train.py --config ./configs/enwik8.yaml
```
The relevant parameters, such as `batch_size` and `epoch`, can be set in the enwik8.yaml file; a loading sketch is shown below.
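The following is a minimal sketch of how the config is consumed, mirroring train.py in this directory (assuming it is run from this directory; the overridden values are only for illustration):

```python
import yaml
from attrdict import AttrDict

from train import do_train  # train.py in this directory

with open("./configs/enwik8.yaml", "rt") as f:
    args = AttrDict(yaml.safe_load(f))

# Illustrative overrides before launching training.
args.batch_size = 16
args.epoch = 30

do_train(args)
```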
### Multi-GPU (single machine)
Similarly, run the following command to train with eight GPUs:
```sh
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
python -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" train.py --config ./configs/enwik8.yaml
```
### Inference
Taking the enwik8 data as an example, after training finishes, run the following command to make predictions:
```sh
# setting visible devices for prediction
export CUDA_VISIBLE_DEVICES=0
python eval.py --config ./configs/enwik8.yaml
```
After inference completes, the results on the validation and test sets will be displayed (loss plus bpc for enwik8/text8, or ppl for wt103), as sketched below.
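The reported numbers are simple transforms of the average per-token loss (see `_logger` in eval.py); a small sketch of the conversion, using an illustrative loss value:

```python
import numpy as np

avg_loss = 0.80  # average per-token loss in nats (illustrative value)

bpc = avg_loss / np.log(2)  # bits per character, reported for enwik8/text8
ppl = np.exp(avg_loss)      # perplexity, reported for wt103

print("bpc: %.3f, ppl: %.2f" % (bpc, ppl))  # bpc: 1.154, ppl: 2.23
```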
## References
# The frequency to save trained models when training.
save_step: 10000
# The frequency to fetch and print output when training.
print_step: 100
# Path of the checkpoint, to resume the previous training
init_from_checkpoint: ""
# Path of the pretrain model, to better solve the current task
init_from_pretrain_model: ""
# Path of trained parameter, to make prediction
init_from_params: "./trained_models/step_final/"
# The directory for saving model
save_model: "trained_models"
# The directory for saving inference model.
inference_model_dir: "infer_model"
# Set seed for CE or debug
random_seed: None
# The path to data files
data: "./gen_data/enwik8/"
# The name of dataset
dataset: "enwik8"
# Whether to use cuda
use_gpu: True
# Args for reader, see reader.py for details
token_delimiter: None
batch_size: 16
eval_batch_size: 2
# Hyperparams for training:
# The number of epochs for training.
epoch: 30
# The hyperparameters for the optimizer.
# Type of optimizer.
optim: adam
# Learning rate schedule.
scheduler: cosine
# This static learning_rate will be applied to the LearningRateScheduler-
# derived learning rate to get the final learning rate.
learning_rate: 0.00025
# The hyper parameters for Adam optimizer.
beta1: 0.9
beta2: 0.997
eps: 1e-9
# The hyper parameters for Momentum optimizer.
mom: 0.0
# Global gradient clip.
clip: 0.25
# The parameters for learning rate scheduling.
warmup_steps: 0
# The parameters for CosineAnnealingDecay. Minimum learning rate.
eta_min: 0.0
# The parameters for ReduceLROnPlateau.
# The ratio by which the learning rate will be reduced.
decay_rate: 0.5
# When the loss doesn't improve for this number of epochs, the learning rate will be reduced.
patience: 0
# The lower bound of the learning rate after reduction.
min_lr: 0.0
# Hyperparams for model:
# Whether to use adaptive softmax.
adaptive: False
# Size of dictionary. This can be obtained automatically.
ntokens: 10000
# The dimension for word embeddings, which is also the last dimension of
# the input and output of multi-head attention, position-wise feed-forward
# networks, encoder and decoder.
d_model: 512
# Dimension of heads.
d_head: 64
# Size of the hidden layer in position-wise feed-forward networks.
d_inner_hid: 2048
# Number of head used in multi-head attention.
n_head: 8
# Number of sub-layers to be stacked in the encoder and decoder.
n_layer: 12
# Dropout rates.
dropout: 0.1
# Attention dropout
attn_dropout: 0.0
# Attention type for decoder.
# 0 for relative partial MHA (in Transformer-XL).
# 1 for relative MHA (in Shaw et al).
attn_type: 0
# Apply layer normalization before or after sublayers.
normalize_before: False
# Whether to tie weight or not.
tie_weight: True
# The length of the extended context.
ext_len: 0
# The divisor value for the adaptive softmax and adaptive input.
div_val: 1
# Target length. The number of tokens to predict.
tgt_len: 512
# Memory length. The length of the retained previous hidden states.
mem_len: 512
# Use the same attention length for all tokens.
same_length: False
# Use the same positional encoding after clamp len.
clamp_len: -1
# The number of samples in sample softmax. -1 means do not use sampled softmax.
sample_softmax: -1
# Max step for training.
max_step: 400000
# Target length for evaluation. That is, the number of tokens to predict for evaluation.
eval_tgt_len: 128
# Evaluation mode: "valid", "test", or both ("all").
mode: "all"
# Maximum evaluation step.
max_eval_steps: -1
# The frequency to save trained models when training.
save_step: 10000
# The frequency to fetch and print output when training.
print_step: 100
# Path of the checkpoint, to resume the previous training
init_from_checkpoint: ""
# Path of the pretrain model, to better solve the current task
init_from_pretrain_model: ""
# Path of trained parameter, to make prediction
init_from_params: "./trained_models/step_final/"
# The directory for saving model
save_model: "trained_models"
# The directory for saving inference model.
inference_model_dir: "infer_model"
# Set seed for CE or debug
random_seed: None
# The path to data files
data: "./gen_data/text8/"
# The name of dataset
dataset: "text8"
# Whether to use cuda
use_gpu: True
# Args for reader, see reader.py for details
token_delimiter: None
batch_size: 15
eval_batch_size: 5
# Hyperparams for training:
# The number of epochs for training.
epoch: 30
# The hyperparameters for the optimizer.
# Type of optimizer.
optim: adam
# Learning rate schedule.
scheduler: cosine
# This static learning_rate will be applied to the LearningRateScheduler-
# derived learning rate to get the final learning rate.
learning_rate: 0.00025
# The hyper parameters for Adam optimizer.
beta1: 0.9
beta2: 0.997
eps: 1e-9
# The hyper parameters for Momentum optimizer.
mom: 0.0
# Global gradient clip.
clip: 0.25
# The parameters for learning rate scheduling.
warmup_steps: 0
# The parameters for CosineAnnealingDecay. Minimum learning rate.
eta_min: 0.0
# The parameters for ReduceLROnPlateau.
# The ratio by which the learning rate will be reduced.
decay_rate: 0.5
# When the loss doesn't improve for this number of epochs, the learning rate will be reduced.
patience: 0
# The lower bound of the learning rate after reduction.
min_lr: 0.0
# Hyperparams for model:
# Whether to use adaptive softmax.
adaptive: False
# Size of dictionary. This can be obtained automatically.
ntokens: 10000
# The dimension for word embeddings, which is also the last dimension of
# the input and output of multi-head attention, position-wise feed-forward
# networks, encoder and decoder.
d_model: 512
# Dimension of heads.
d_head: 64
# Size of the hidden layer in position-wise feed-forward networks.
d_inner_hid: 2048
# Number of head used in multi-head attention.
n_head: 8
# Number of sub-layers to be stacked in the encoder and decoder.
n_layer: 12
# Dropout rates.
dropout: 0.1
# Attention dropout
attn_dropout: 0.0
# Attention type for decoder.
# 0 for relative partial MHA (in Transformer-XL).
# 1 for relative MHA (in Shaw et al).
attn_type: 0
# Apply layer normalization before or after sublayers.
normalize_before: False
# Whether to tie weight or not.
tie_weight: True
# The length of the extended context.
ext_len: 0
# The divisor value for the adaptive softmax and adaptive input.
div_val: 1
# Target length. The number of tokens to predict.
tgt_len: 512
# Memory length. The length of the retained previous hidden states.
mem_len: 512
# Use the same attention length for all tokens.
same_length: False
# Use the same positional encoding after clamp len.
clamp_len: -1
# The number of samples in sample softmax. -1 means do not use sampled softmax.
sample_softmax: -1
# Max step for training.
max_step: 400000
# Target length for evaluation. That is, the number of tokens to predict for evaluation.
eval_tgt_len: 128
# Evaluation mode: "valid", "test", or both ("all").
mode: "all"
# Maximum evaluation step.
max_eval_steps: -1
# The frequency to save trained models when training.
save_step: 10000
# The frequency to fetch and print output when training.
print_step: 100
# Path of the checkpoint, to resume the previous training
init_from_checkpoint: ""
# Path of the pretrain model, to better solve the current task
init_from_pretrain_model: ""
# Path of trained parameter, to make prediction
init_from_params: "./trained_models/step_final/"
# The directory for saving model
save_model: "trained_models"
# The directory for saving inference model.
inference_model_dir: "infer_model"
# Set seed for CE or debug
random_seed: None
# The path to data files
data: "./gen_data/wikitext-103/"
# The name of dataset
dataset: "wt103"
# Whether to use cuda
use_gpu: True
# Args for reader, see reader.py for details
token_delimiter: None
batch_size: 32
eval_batch_size: 5
# Hyperparams for training:
# The number of epochs for training.
epoch: 30
# The hyperparameters for the optimizer.
# Type of optimizer.
optim: adam
# Learning rate schedule.
scheduler: cosine
# This static learning_rate will be applied to the LearningRateScheduler-
# derived learning rate to get the final learning rate.
learning_rate: 0.00025
# The hyper parameters for Adam optimizer.
beta1: 0.9
beta2: 0.997
eps: 1e-9
# The hyper parameters for Momentum optimizer.
mom: 0.0
# Global gradient clip.
clip: 0.25
# The parameters for learning rate scheduling.
warmup_steps: 0
# The parameters for CosineAnnealingDecay. Minimum learning rate.
eta_min: 0.0
# The parameters for ReduceLROnPlateau.
# The ratio by which the learning rate will be reduced.
decay_rate: 0.5
# When the loss doesn't improve for this number of epochs, the learning rate will be reduced.
patience: 0
# The lower bound of the learning rate after reduction.
min_lr: 0.0
# Hyperparams for model:
# Whether to use adaptive softmax.
adaptive: True
# Size of dictionary. This can be obtained automatically.
ntokens: 10000
# The dimension for word embeddings, which is also the last dimension of
# the input and output of multi-head attention, position-wise feed-forward
# networks, encoder and decoder.
d_model: 410
# Dimension of heads.
d_head: 41
# Size of the hidden layer in position-wise feed-forward networks.
d_inner_hid: 2100
# Number of head used in multi-head attention.
n_head: 10
# Number of sub-layers to be stacked in the encoder and decoder.
n_layer: 16
# Dropout rates.
dropout: 0.1
# Attention dropout
attn_dropout: 0.0
# Attention type for decoder.
# 0 for relative partial MHA (in Transformer-XL).
# 1 for relative MHA (in Shaw et al).
attn_type: 0
# Apply layer normalization before or after sublayers.
normalize_before: False
# Whether to tie weight or not.
tie_weight: True
# The length of the extended context.
ext_len: 0
# The divisor value for the adaptive softmax and adaptive input.
div_val: 1
# Target length. The number of tokens to predict.
tgt_len: 150
# Memory length. The length of the retained previous hidden states.
mem_len: 150
# Target length for evaluation. That is, the number of tokens to predict for evaluation.
eval_tgt_len: 150
# Use the same attention length for all tokens.
same_length: False
# Use the same positional encoding after clamp len.
clamp_len: -1
# The number of samples in sample softmax. -1 means do not use sampled softmax.
sample_softmax: -1
# Max step for training.
max_step: 200000
# Evaluation mode: "valid", "test", or both ("all").
mode: "all"
# Maximum evaluation step.
max_eval_steps: -1
import os
import time
import yaml
import logging
import argparse
import numpy as np
from pprint import pprint
from attrdict import AttrDict
import paddle
from reader import get_lm_vocab, get_lm_data_loader
from mem_transformer import MemTransformerLM
FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--config",
default="./configs/enwik8.yaml",
type=str,
help="Path of the config file. ")
args = parser.parse_args()
return args
def do_eval(args):
assert args.ext_len >= 0, 'Extended context length must be no less than 0'
def _evaluate(loader):
total_len, total_loss = 0, 0.
eval_mems = tuple()
for i, (src, target, seq_len) in enumerate(loader):
if args.max_eval_steps > 0 and i >= args.max_eval_steps:
break
ret = mem_transformer(src, target, *eval_mems)
loss, eval_mems = ret[0], ret[1:]
seq_len = seq_len.numpy()
eval_cur_loss = seq_len * loss.numpy()
total_loss += eval_cur_loss
total_len += seq_len
return total_loss / total_len
def _logger(loss):
if args.dataset in ['enwik8', 'text8']:
logger_info = "loss: %f, bpc: %f" % \
(loss, loss / np.log(2))
else:
logger_info = "loss: %f, ppl: %.2f" % \
(loss, np.exp(loss))
return logger_info
vocab = get_lm_vocab(args)
eval_loader = get_lm_data_loader(args, vocab, "valid")
test_loader = get_lm_data_loader(args, vocab, "test")
cutoffs, tie_projs = [], [False]
if args.adaptive:
assert args.dataset in ['wt103', 'lm1b']
if args.dataset == 'wt103':
cutoffs = [20000, 40000, 200000]
tie_projs += [True] * len(cutoffs)
elif args.dataset == 'lm1b':
cutoffs = [60000, 100000, 640000]
tie_projs += [False] * len(cutoffs)
mem_transformer = MemTransformerLM(
args.ntokens,
args.n_layer,
args.n_head,
args.d_model,
args.d_head,
args.d_inner_hid,
args.dropout,
args.attn_dropout,
tie_weight=args.tie_weight,
d_embed=args.d_model,
div_val=args.div_val,
tie_projs=tie_projs,
normalize_before=args.normalize_before,
tgt_len=args.tgt_len,
ext_len=args.ext_len,
mem_len=args.mem_len,
cutoffs=cutoffs,
same_length=args.same_length,
attn_type=args.attn_type,
clamp_len=args.clamp_len,
sample_softmax=args.sample_softmax)
assert args.init_from_params, (
"Please set init_from_params to load the infer model.")
model_dict = paddle.load(
os.path.join(args.init_from_params, "mem_transformer.pdparams"))
mem_transformer.load_dict(model_dict)
logger.info(
"Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}".
format(args.batch_size, args.tgt_len, args.ext_len, args.mem_len,
args.clamp_len))
mem_transformer.reset_length(args.tgt_len, args.ext_len, args.mem_len)
test_loss = None
valid_loss = None
if args.mode == 'all':
test_loss = _evaluate(test_loader)
valid_loss = _evaluate(eval_loader)
elif args.mode == 'valid':
valid_loss = _evaluate(eval_loader)
elif args.mode == 'test':
test_loss = _evaluate(test_loader)
logger_info = ''
if valid_loss is not None:
logger_info = logger_info + _logger(valid_loss)
if test_loss is not None:
logger_info = logger_info + _logger(test_loss)
logger.info(logger_info)
if __name__ == "__main__":
ARGS = parse_args()
yaml_file = ARGS.config
with open(yaml_file, 'rt') as f:
args = AttrDict(yaml.safe_load(f))
pprint(args)
do_eval(args)
echo "Downloading dataset..."
CUR_DIR=$PWD
mkdir -p gen_data
cd ./gen_data/
if [ ! -d "wikitext-103" ]; then
echo "Downloading wikitext-103..."
wget -O wikitext-103-v1.zip https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip
echo "Unzip wikitext-103..."
unzip wikitext-103-v1.zip
cd wikitext-103
# Rename
mv wiki.train.tokens train.txt
mv wiki.valid.tokens valid.txt
mv wiki.test.tokens test.txt
cd -
fi
if [ ! -d 'enwik8' ]; then
mkdir -p enwik8
cd enwik8
echo "Downloading enwik8..."
wget -O enwik8.zip http://mattmahoney.net/dc/enwik8.zip
wget -O prep_enwik8.py https://raw.githubusercontent.com/salesforce/awd-lstm-lm/master/data/enwik8/prep_enwik8.py
python3 prep_enwik8.py
rm -f prep_enwik8.py
cd -
fi
if [ ! -d 'text8' ]; then
mkdir -p text8
cd text8
echo "Downloading text8..."
wget -O text8.zip http://mattmahoney.net/dc/text8.zip
python ${CUR_DIR}/utils/preprocess_text8.py 5000000
cd -
fi
if [ ! -d 'one-billion-words' ]; then
mkdir -p one-billion-words
cd one-billion-words
echo "Downloading one-billion-words..."
wget -O 1-billion-word-language-modeling-benchmark-r13output.tar.gz http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz
tar xzf 1-billion-word-language-modeling-benchmark-r13output.tar.gz
dir="./1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/"
cat ${dir}/news.en.heldout-00000-of-00050 > valid.txt
cat ${dir}/news.en.heldout-00000-of-00050 > test.txt
wget -O 1b_word_vocab.txt https://github.com/rafaljozefowicz/lm/raw/master/1b_word_vocab.txt
cd -
fi
echo "All done. "
import os
import numpy as np
from paddlenlp.data import Vocab
import paddle
from paddle.io import IterableDataset, DataLoader
import paddle.distributed as dist
class LMDataset(IterableDataset):
def __init__(self, mode, vocab, path, dataset_name, batch_size, bptt,
ext_len, nranks, rank):
assert (mode in ["train", "valid", "test"]
), "Parameter mode must be one of [train, valid, test]."
super(LMDataset, self).__init__()
self.vocab = vocab
self.dataset_name = dataset_name
if self.dataset_name in ["wt103"]:
self.data = self.read_raw_data(
filename=os.path.join(path, mode + ".txt"), ordered=True)
elif self.dataset_name in ["enwik8", "text8"]:
self.data = self.read_raw_data(
filename=os.path.join(path, mode + ".txt"),
ordered=True,
add_eos=False)
else:
raise ValueError("Not supported dataset yet. ")
self.rank = rank
self.batch_size = batch_size
batch_size *= nranks
self.bptt = bptt
self.ext_len = ext_len if ext_len is not None else 0
self.num_step = len(self.data) // batch_size
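        # Drop tail tokens that do not fill a whole global batch, then reshape
        # to [batch_size * nranks, num_step] so each row is a contiguous stream.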
data = self.data[:self.num_step * batch_size]
self.data = data.reshape([batch_size, -1])
# Number of samples
self.num_samples = (self.num_step + self.bptt - 1) // self.bptt
def __len__(self):
return self.num_samples
def __iter__(self):
for i in range(0, self.data.shape[1] - 1, self.bptt):
seq_len = min(self.bptt, self.data.shape[1] - 1 - i)
end_idx = i + seq_len
beg_idx = max(0, i - self.ext_len)
src = self.data[:, beg_idx:end_idx]
target = self.data[:, i + 1:i + 1 + seq_len]
            # NOTE: `seq_len` will be transferred to numpy immediately
            # after being returned by DataLoader. Hence, `seq_len` can be
            # yielded as an `int`, and the returned tensor `seq_len` will
            # have an empty shape [].
            # However, if `seq_len` needs to be used as input to some
            # PaddlePaddle op, it must be returned as `[seq_len]`, whose
            # shape is [1], because some ops cannot take shape [] as input.
yield [
src[self.rank * self.batch_size:(self.rank + 1) *
self.batch_size], target[self.rank * self.batch_size:(
self.rank + 1) * self.batch_size], seq_len
]
def read_raw_data(self,
filename,
ordered=False,
lower_case=True,
delimiter=None,
add_eos=True,
add_double_eos=False):
        assert os.path.exists(filename), "%s does not exist. " % filename
data = []
with open(filename, 'r', encoding='utf-8') as f:
for line in f:
tokens = LMDataset.tokenize(
line=line, delimiter=delimiter, lower_case=lower_case)
if add_double_eos: # for lm1b
tokens = [self.vocab._identifiers_to_tokens['bos_token']
] + tokens + [
self.vocab._identifiers_to_tokens['bos_token']
]
elif add_eos:
tokens = tokens + [
self.vocab._identifiers_to_tokens['eos_token']
]
data.append(
np.asarray(self.get_indices(tokens)).astype("int64"))
if ordered:
data = np.concatenate(data)
return data
def get_indices(self, tokens):
return self.vocab.to_indices(tokens)
@classmethod
def get_vocab(cls,
files,
max_size=None,
min_freq=0,
lower_case=True,
delimiter=None,
unk_token=None,
pad_token=None,
bos_token=None,
eos_token=None,
**kwargs):
return Vocab.build_vocab(
cls.data_iterator(
files=files, delimiter=delimiter, lower_case=lower_case),
max_size=max_size,
min_freq=min_freq,
unk_token=unk_token,
pad_token=pad_token,
bos_token=bos_token,
eos_token=eos_token)
@classmethod
def tokenize(cls, line, delimiter=None, lower_case=True):
line = line.strip()
if lower_case:
line = line.lower()
tokens = list(line) if delimiter == "" else line.split(delimiter)
return tokens
@classmethod
def data_iterator(cls, files, delimiter=None, lower_case=True):
if isinstance(files, str):
files = [files]
elif not isinstance(files, (list, tuple)):
raise ValueError(
"The parameter files must be a str or a list/tuple.")
for fl in files:
            assert os.path.exists(fl), "%s does not exist. " % fl
with open(fl, 'r', encoding='utf-8') as f:
for line in f:
tokens = cls.tokenize(
line=line, delimiter=delimiter, lower_case=lower_case)
yield tokens
def get_lm_data_loader(args, vocab, mode="train"):
lm_dataset = LMDataset(
mode=mode,
vocab=vocab,
path=args.data,
dataset_name=args.dataset,
batch_size=args.batch_size if mode == "train" else args.eval_batch_size,
bptt=args.tgt_len,
ext_len=args.ext_len,
nranks=dist.get_world_size() if mode == "train" else 1,
rank=dist.get_rank() if mode == "train" else 0)
data_loader = DataLoader(
dataset=lm_dataset, batch_size=None, num_workers=0, return_list=True)
return data_loader
def get_lm_vocab(args):
kwargs = {"unk_token": "<unk>"}
if args.token_delimiter == "None":
kwargs["delimiter"] = None
else:
kwargs["delimiter"] = args.token_delimiter
if args.dataset == "wt103":
kwargs["eos_token"] = "<eos>"
kwargs["lower_case"] = False
if args.dataset in ["enwik8", "text8"]:
files = [
os.path.join(args.data, "train.txt"),
os.path.join(args.data, "valid.txt"),
os.path.join(args.data, "test.txt")
]
elif args.dataset == "wt103":
files = [os.path.join(args.data, "train.txt")]
else:
raise ValueError("Not supported dataset yet. ")
vocab = LMDataset.get_vocab(files, **kwargs)
args.ntokens = len(vocab)
print("Finish processing vocabulary, and the size of vocabulary is {}".
format(args.ntokens))
return vocab
import os
import time
import yaml
import logging
import argparse
import numpy as np
from pprint import pprint
from attrdict import AttrDict
import paddle
import paddle.nn as nn
import paddle.distributed as dist
from mem_transformer import MemTransformerLM
from reader import get_lm_vocab, get_lm_data_loader
FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--config",
default="./configs/enwik8.yaml",
type=str,
help="Path of the config file. ")
args = parser.parse_args()
return args
def do_train(args):
if args.use_gpu:
rank = dist.get_rank()
trainer_count = dist.get_world_size()
else:
rank = 0
trainer_count = 1
if trainer_count > 1:
dist.init_parallel_env()
random_seed = eval(str(args.random_seed))
if random_seed is not None:
paddle.seed(random_seed)
vocab = get_lm_vocab(args)
train_loader = get_lm_data_loader(args, vocab, "train")
eval_loader = get_lm_data_loader(args, vocab, "valid")
cutoffs, tie_projs = [], [False]
if args.adaptive:
assert args.dataset in ['wt103', 'lm1b']
if args.dataset == 'wt103':
cutoffs = [20000, 40000, 200000]
tie_projs += [True] * len(cutoffs)
elif args.dataset == 'lm1b':
cutoffs = [60000, 100000, 640000]
tie_projs += [False] * len(cutoffs)
mem_transformer = MemTransformerLM(
args.ntokens,
args.n_layer,
args.n_head,
args.d_model,
args.d_head,
args.d_inner_hid,
args.dropout,
args.attn_dropout,
tie_weight=args.tie_weight,
d_embed=args.d_model,
div_val=args.div_val,
tie_projs=tie_projs,
normalize_before=args.normalize_before,
tgt_len=args.tgt_len,
ext_len=args.ext_len,
mem_len=args.mem_len,
cutoffs=cutoffs,
same_length=args.same_length,
attn_type=args.attn_type,
clamp_len=args.clamp_len,
sample_softmax=args.sample_softmax)
if args.scheduler == 'cosine':
scheduler = paddle.optimizer.lr.CosineAnnealingDecay(
learning_rate=args.learning_rate,
T_max=args.max_step,
eta_min=args.eta_min)
elif args.scheduler == 'noam':
scheduler = paddle.optimizer.lr.NoamDecay(
d_model=args.d_model,
warmup_steps=args.warmup_steps,
learning_rate=args.learning_rate)
elif args.scheduler == 'dev_perf':
# fluid api
scheduler = paddle.fluid.dygraph.ReduceLROnPlateau(
learning_rate=args.learning_rate,
decay_rate=args.decay_rate,
patience=args.patience,
            min_lr=args.min_lr)
elif args.scheduler == 'constant':
scheduler = args.learning_rate
clip = paddle.nn.ClipGradByGlobalNorm(args.clip)
if args.optim.lower() == 'momentum':
optimizer = paddle.optimizer.Momentum(
learning_rate=scheduler,
parameters=mem_transformer.parameters(),
momentum=args.mom,
grad_clip=clip)
elif args.optim.lower() == 'adam':
optimizer = paddle.optimizer.Adam(
learning_rate=scheduler,
parameters=mem_transformer.parameters(),
beta1=args.beta1,
beta2=args.beta2,
epsilon=eval(args.eps),
grad_clip=clip)
elif args.optim.lower() == 'adagrad':
optimizer = paddle.optimizer.Adagrad(
learning_rate=scheduler,
parameters=mem_transformer.parameters(),
grad_clip=clip)
# Init from some checkpoint, to resume the previous training
if args.init_from_checkpoint:
model_dict = paddle.load(
os.path.join(args.init_from_checkpoint, "mem_transformer.pdparams"))
opt_dict = paddle.load(
os.path.join(args.init_from_checkpoint, "mem_transformer.pdopt"))
mem_transformer.set_state_dict(model_dict)
optimizer.set_state_dict(opt_dict)
print("loaded from checkpoint.")
# Init from some pretrain models, to better solve the current task
if args.init_from_pretrain_model:
model_dict = paddle.load(
os.path.join(args.init_from_pretrain_model,
"mem_transformer.pdparams"))
mem_transformer.set_state_dict(model_dict)
print("loaded from pre-trained model.")
if trainer_count > 1:
mem_transformer = paddle.DataParallel(mem_transformer)
step_idx = 0
train_loss = 0.0
log_start_time = time.time()
for pass_id in range(args.epoch):
batch_id = 0
mems = tuple()
for input_data in train_loader:
(src, target, seq_len) = input_data
ret = mem_transformer(src, target, *mems)
loss = ret[0]
mems = ret[1:]
train_loss += loss.numpy()
loss.backward()
optimizer.step()
optimizer.clear_grad()
if step_idx > 0 and step_idx % args.print_step == 0 and rank == 0:
cur_loss = train_loss / args.print_step
elapsed = time.time() - log_start_time
if args.scheduler == "constant":
lr = optimizer.get_lr()
else:
lr = scheduler.get_lr()
logger_info = "step_idx: %d, epoch: %d, batch: %d, learning rate: %.8f, " \
"speed: %f ms/batch, loss: %f" % \
(step_idx, pass_id, batch_id, lr,
elapsed * 1000.0 / args.print_step, cur_loss)
if args.dataset in ["enwik8", "text8"]:
logger_info = logger_info + ", bpc: %f" % (cur_loss /
np.log(2))
else:
logger_info = logger_info + ", ppl: %f" % (np.exp(cur_loss))
logger.info(logger_info)
train_loss = 0.0
log_start_time = time.time()
if step_idx % args.save_step == 0 and step_idx != 0:
# Do validation.
mem_transformer.eval()
# TODO(FrostML): simplify this.
if args.mem_len == 0:
if dist.get_world_size() == 1:
mem_transformer.reset_length(
tgt_len=args.eval_tgt_len,
ext_len=args.ext_len + args.tgt_len -
args.eval_tgt_len,
mem_len=args.mem_len)
else:
mem_transformer._layers.reset_length(
tgt_len=args.eval_tgt_len,
ext_len=args.ext_len + args.tgt_len -
args.eval_tgt_len,
mem_len=args.mem_len)
else:
if dist.get_world_size() == 1:
mem_transformer.reset_length(
tgt_len=args.eval_tgt_len,
ext_len=args.ext_len,
mem_len=args.mem_len + args.tgt_len -
args.eval_tgt_len)
else:
mem_transformer._layers.reset_length(
tgt_len=args.eval_tgt_len,
ext_len=args.ext_len,
mem_len=args.mem_len + args.tgt_len -
args.eval_tgt_len)
total_len, total_loss = 0, 0.
eval_mems = tuple()
with paddle.no_grad():
for i, (src, target, seq_len) in enumerate(eval_loader):
if args.max_eval_steps > 0 and i >= args.max_eval_steps:
break
ret = mem_transformer(src, target, *eval_mems)
loss, eval_mems = ret[0], ret[1:]
seq_len = seq_len.numpy()
eval_cur_loss = seq_len * loss.numpy()
total_loss += eval_cur_loss
total_len += seq_len
eval_loss = total_loss / total_len
logger_info = "Validation, step_idx: %d, validation loss: %f" % \
(step_idx, eval_loss)
if args.dataset in ['enwik8', 'text8']:
logger_info = logger_info + ", bpc: %f" % (eval_loss /
np.log(2))
else:
logger_info = logger_info + ", ppl: %f" % (np.exp(eval_loss)
)
logger.info(logger_info)
if args.save_model and rank == 0:
model_dir = os.path.join(args.save_model,
"step_" + str(step_idx))
if not os.path.exists(model_dir):
os.makedirs(model_dir)
paddle.save(
mem_transformer.state_dict(),
os.path.join(model_dir, "mem_transformer.pdparams"))
paddle.save(
optimizer.state_dict(),
os.path.join(model_dir, "mem_transformer.pdopt"))
if args.scheduler == 'dev_perf':
scheduler.step(eval_loss)
# TODO(FrostML): simplify this.
if dist.get_world_size() == 1:
mem_transformer.reset_length(
tgt_len=args.tgt_len,
ext_len=args.ext_len,
mem_len=args.mem_len)
else:
mem_transformer._layers.reset_length(
tgt_len=args.tgt_len,
ext_len=args.ext_len,
mem_len=args.mem_len)
mem_transformer.train()
step_idx += 1
batch_id += 1
if args.scheduler in ['cosine', 'dev_perf']:
if step_idx < args.warmup_steps:
curr_lr = args.learning_rate * step_idx / args.warmup_steps
scheduler.base_lr = curr_lr
else:
if args.scheduler == 'cosine':
scheduler.step()
elif args.scheduler == 'constant':
if step_idx < args.warmup_steps:
curr_lr = args.learning_rate * step_idx / args.warmup_steps
optimizer.set_lr(curr_lr)
elif args.scheduler == 'noam':
scheduler.step()
if step_idx >= args.max_step:
break
if args.save_model and rank == 0:
model_dir = os.path.join(args.save_model, "step_final")
if not os.path.exists(model_dir):
os.makedirs(model_dir)
paddle.save(mem_transformer.state_dict(),
os.path.join(model_dir, "mem_transformer.pdparams"))
paddle.save(optimizer.state_dict(),
os.path.join(model_dir, "mem_transformer.pdopt"))
if __name__ == "__main__":
ARGS = parse_args()
yaml_file = ARGS.config
with open(yaml_file, 'rt') as f:
args = AttrDict(yaml.safe_load(f))
pprint(args)
do_train(args)
import sys
import zipfile
import argparse
if __name__ == "__main__":
    zipfile.ZipFile("text8.zip").extractall()
    data = open("text8", "r", encoding="utf-8").read()
num_test_char = int(sys.argv[1])
train_data = data[:-2 * num_test_char]
valid_data = data[-2 * num_test_char:-num_test_char]
test_data = data[-num_test_char:]
for files, data in [("train.txt", train_data), ("valid.txt", valid_data),
("test.txt", test_data)]:
data_str = " ".join(["_" if c == " " else c for c in data.strip()])
with open(files, "w") as f:
f.write(data_str)
with open(files + ".raw", "w", encoding="utf-8") as fw:
fw.write(data)
......@@ -27,7 +27,7 @@ The DuReader-robust dataset is a single-passage, extractive reading-comprehension dataset. Specifically
* PaddlePaddle installation
This project requires PaddlePaddle 2.0 or later. Please follow the [installation guide](http://www.paddlepaddle.org/#quick-start) to install it.
This project requires PaddlePaddle 2.0-rc1 or later. Please follow the [installation guide](http://www.paddlepaddle.org/#quick-start) to install it.
* PaddleNLP installation
......
......@@ -181,7 +181,7 @@ def do_train(args):
args.learning_rate,
lambda current_step, warmup_proportion=args.warmup_proportion,
num_training_steps=args.max_steps if args.max_steps > 0 else
(len(train_ds.examples)//args.batch_size*args.num_train_epochs): float(
(len(train_data_loader)*args.num_train_epochs): float(
current_step) / float(max(1, warmup_proportion*num_training_steps))
if current_step < warmup_proportion*num_training_steps else max(
0.0,
......
......@@ -41,7 +41,7 @@
* PaddlePaddle installation
This project requires PaddlePaddle 2.0 or later. Please follow the [installation guide](http://www.paddlepaddle.org/#quick-start) to install it.
This project requires PaddlePaddle 2.0-rc1 or later. Please follow the [installation guide](http://www.paddlepaddle.org/#quick-start) to install it.
* PaddleNLP installation
......
......@@ -27,7 +27,7 @@ SQuAD v2.0
* PaddlePaddle installation
This project requires PaddlePaddle 2.0 or later. Please follow the [installation guide](http://www.paddlepaddle.org/#quick-start) to install it.
This project requires PaddlePaddle 2.0-rc1 or later. Please follow the [installation guide](http://www.paddlepaddle.org/#quick-start) to install it.
* PaddleNLP installation
......@@ -56,7 +56,7 @@ python -u ./run_squad.py \
--batch_size 12 \
--learning_rate 3e-5 \
--num_train_epochs 2 \
--logging_steps 1000 \
--logging_steps 100 \
--save_steps 1000 \
--warmup_proportion 0.1 \
--weight_decay 0.01 \
......
......@@ -19,7 +19,7 @@ Sequence to Sequence (Seq2Seq), using an encoder-decoder
This directory contains a classic Seq2Seq example: machine translation with an attention-based translation model. The Seq2Seq translation model imitates how a human translator works: first parse the source-language sentence and understand its meaning, then write the target-language sentence according to that meaning. For more on the principles and mathematical formulation of machine translation, we recommend the PaddlePaddle [machine translation tutorial](https://www.paddlepaddle.org.cn/documentation/docs/zh/user_guides/nlp_case/machine_translation/README.cn.html).
Running the example model in this directory requires PaddlePaddle 2.0-rc. If your installed PaddlePaddle version is lower than this requirement, please update it following the instructions in the [installation documentation](https://www.paddlepaddle.org.cn/#quick-start).
Running the example model in this directory requires PaddlePaddle 2.0-rc1 or later. If your installed PaddlePaddle version is lower than this requirement, please update it following the instructions in the [installation documentation](https://www.paddlepaddle.org.cn/#quick-start).
## Model overview
......
......@@ -24,9 +24,15 @@ use_gpu: True
# Args for reader, see reader.py for details
pool_size: 200000
sort_type: "pool"
sort_type: "global"
batch_size: 4096
infer_batch_size: 8
shuffle_batch: True
# Data shuffle only works when sort_type is pool or none
shuffle: True
# shuffle_seed must be set when shuffle is True and training with multiple cards.
# Otherwise, the number of batches on different cards cannot be guaranteed to be consistent.
shuffle_seed: 128
# Hyparams for training:
# The number of epoches for training
......
......@@ -24,9 +24,15 @@ use_gpu: True
# Args for reader, see reader.py for details
pool_size: 200000
sort_type: "pool"
sort_type: "global"
batch_size: 4096
infer_batch_size: 8
shuffle_batch: True
# Data shuffle only works when sort_type is pool or none
shuffle: True
# shuffle_seed must be set when shuffle is True and training with multiple cards.
# Otherwise, the number of batches on different cards cannot be guaranteed to be consistent.
shuffle_seed: 128
# Hyparams for training:
# The number of epoches for training
......
......@@ -43,6 +43,12 @@ def create_data_loader(args):
mode=m, transform_func=transform_func) for m in ["train", "dev"]
]
if args.shuffle or args.shuffle_batch:
if args.shuffle_seed == "None" or args.shuffle_seed is None:
shuffle_seed = 0
else:
shuffle_seed = args.shuffle_seed
def _max_token_fn(current_idx, current_batch_size, tokens_sofar,
data_source):
return max(tokens_sofar,
......@@ -60,19 +66,17 @@ def create_data_loader(args):
min_max_filer, max_len=args.max_length))
sampler = SamplerHelper(dataset)
src_key = (lambda x, data_source: len(data_source[x][0]) + 1)
if args.sort_type == SortType.GLOBAL:
buffer_size = -1
src_key = (lambda x, data_source: len(data_source[x][0]) + 1)
trg_key = (lambda x, data_source: len(data_source[x][1]) + 1)
# Sort twice
sampler = sampler.sort(
key=trg_key, buffer_size=buffer_size).sort(
key=src_key, buffer_size=buffer_size)
sampler = sampler.sort(key=trg_key).sort(key=src_key)
else:
sampler = sampler.shuffle()
if args.shuffle:
sampler = sampler.shuffle(seed=shuffle_seed)
max_key = (lambda x, data_source: max(len(data_source[x][0]), len(data_source[x][1])) + 1)
if args.sort_type == SortType.POOL:
buffer_size = args.pool_size
sampler = sampler.sort(key=src_key, buffer_size=buffer_size)
sampler = sampler.sort(key=max_key, buffer_size=args.pool_size)
batch_sampler = sampler.batch(
batch_size=args.batch_size,
......@@ -80,6 +84,9 @@ def create_data_loader(args):
batch_size_fn=_max_token_fn,
key=_key)
if args.shuffle_batch:
batch_sampler.shuffle(seed=shuffle_seed)
if m == "train":
batch_sampler = batch_sampler.shard()
......
......@@ -4,7 +4,7 @@ The BERT-base model is a general-purpose semantic representation model with strong transferability, but
## Compression results
Compression is applied to the fine-tuned `bert-base-uncased` results on the GLUE dev sets. The accuracy of the compressed model versus the uncompressed model on the GLUE dev sets is shown in the table below. The compressed model is about 2x faster than before compression, and the number of parameters is reduced by 26% (from 110M to 81M).
Compression is applied to the fine-tuned `bert-base-uncased` results on the GLUE dev sets. The accuracy of the compressed model versus the uncompressed model on the GLUE dev sets is shown in the table below:
| Task | Metric | Result | Result with PaddleSlim |
|:-----:|:----------------------------:|:-----------------:|:----------------------:|
......@@ -17,6 +17,7 @@ The BERT-base model is a general-purpose semantic representation model with strong transferability, but
| MNLI | Matched acc/MisMatched acc | 0.84422/0.84825 | 0.84687/0.85242 |
| RTE | Accuracy | 0.711191 | 0.718412 |
The compressed model is about 59% faster than before compression (test environment: T4, FP32, batch_size=16), and the number of parameters is reduced by 26% (from 110M to 81M).
## Quick start
This tutorial uses the GLUE/SST-2 dataset as an example.
......@@ -86,7 +87,7 @@ python -u ./run_glue_ofa.py --model_type bert \
- `n_gpu` is the number of GPUs to use. To train with multiple GPUs, set it to the desired number; if it is 0, the CPU is used.
- `width_mult_list` is the range of width choices for each Transformer block during compression training.
The dev results after compression training are shown in the "Result with PaddleSlim" column of the compression results table; the speed is 2x that of the original model.
The dev results after compression training are shown in the "Result with PaddleSlim" column of the compression results table; the speed is 59% faster than the original model.
## Compression principle
......
......@@ -124,3 +124,9 @@ python -u ./predict.py \
year={2020}
}
```
## Online tutorial
We provide an online tutorial for poetry generation; feel free to try it out:
* [Generating poetry with the PaddleNLP pre-trained model ERNIE-GEN](https://aistudio.baidu.com/aistudio/projectdetail/1339888)
Running the example model in this directory requires PaddlePaddle 2.0-rc. If your installed PaddlePaddle version is lower than this requirement, please update it following the instructions in the [installation documentation](https://www.paddlepaddle.org.cn/#quick-start).
Running the example model in this directory requires PaddlePaddle 2.0-rc1 or later. If your installed PaddlePaddle version is lower than this requirement, please update it following the instructions in the [installation documentation](https://www.paddlepaddle.org.cn/#quick-start).
# Variational Autoencoder (VAE) for Text Generation
The brief directory structure and contents of this example model are as follows:
......@@ -15,7 +15,7 @@
```
## Introduction
The implementation in this directory shows how to build a VAE for text generation with Paddle 2.0-rc, using LSTMs as the encoder and decoder. It is trained on the official PTB data and on the Yahoo dataset, respectively.
The implementation in this directory shows how to build a VAE for text generation with Paddle, using LSTMs as the encoder and decoder. It is trained on the official PTB data and on the Yahoo dataset, respectively.
For a detailed introduction to VAEs, see: [(Bowman et al., 2015) Generating Sentences from a Continuous Space](https://arxiv.org/pdf/1511.06349.pdf)
......