提交 af85268f 编写于 作者: D dyonghan 提交者: leiyuning

!1 Initial version

Initial version
上级 0c2d4f04
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# system file
.DS_Store
.swap
# IDE
.idea/
# course
#### Description
{**When you're done, you can delete the content in this README and update the file with details for others getting started with your repository**}
The experimental guidance based on the MindSpore open source deep learning framework. It is only used for teaching or training purposes.
#### Software Architecture
Software architecture description
Part of the content comes from the open source community, internet or third party. If something violates your rights, please leave a message via issue or submit a pull request.
#### Installation
1. xxxx
2. xxxx
3. xxxx
#### Instructions
1. xxxx
2. xxxx
3. xxxx
#### Contribution
1. Fork the repository
2. Create Feat_xxx branch
3. Commit your code
4. Create Pull Request
#### Gitee Feature
1. You can use Readme\_XXX.md to support different languages, such as Readme\_en.md, Readme\_zh.md
2. Gitee blog [blog.gitee.com](https://blog.gitee.com)
3. Explore open source project [https://gitee.com/explore](https://gitee.com/explore)
4. The most valuable open source project [GVP](https://gitee.com/gvp)
5. The manual of Gitee [https://gitee.com/help](https://gitee.com/help)
6. The most popular members [https://gitee.com/gitee-stars/](https://gitee.com/gitee-stars/)
Please go to [MindSpore Open Source Community](https://www.mindspore.cn/) for more videos and documentation tutorials.
# course
#### 介绍
{**以下是码云平台说明,您可以替换此简介**
码云是 OSCHINA 推出的基于 Git 的代码托管平台(同时支持 SVN)。专为开发者提供稳定、高效、安全的云端软件开发协作平台
无论是个人、团队、或是企业,都能够用码云实现代码托管、项目管理、协作开发。企业项目请看 [https://gitee.com/enterprises](https://gitee.com/enterprises)}
基于MindSpore开源深度学习框架的实验指导,仅用于教学或培训目的。
#### 软件架构
软件架构说明
部分内容来源于开源社区、网络或第三方。如果有内容侵犯了您的权利,请通过issue留言,或者提交pull request。
#### 安装教程
1. xxxx
2. xxxx
3. xxxx
#### 使用说明
1. xxxx
2. xxxx
3. xxxx
#### 参与贡献
1. Fork 本仓库
2. 新建 Feat_xxx 分支
3. 提交代码
4. 新建 Pull Request
#### 码云特技
1. 使用 Readme\_XXX.md 来支持不同的语言,例如 Readme\_en.md, Readme\_zh.md
2. 码云官方博客 [blog.gitee.com](https://blog.gitee.com)
3. 你可以 [https://gitee.com/explore](https://gitee.com/explore) 这个地址来了解码云上的优秀开源项目
4. [GVP](https://gitee.com/gvp) 全称是码云最有价值开源项目,是码云综合评定出的优秀开源项目
5. 码云官方提供的使用手册 [https://gitee.com/help](https://gitee.com/help)
6. 码云封面人物是一档用来展示码云会员风采的栏目 [https://gitee.com/gitee-stars/](https://gitee.com/gitee-stars/)
请前往[MindSpore开源社区](https://www.mindspore.cn/)获取更多视频和文档教程。
此差异已折叠。
# LeNet5 mnist
import os
# os.environ['DEVICE_ID'] = '0'
# Log level includes 3(ERROR), 2(WARNING), 1(INFO), 0(DEBUG).
os.environ['GLOG_v'] = '1'
import matplotlib.pyplot as plt
import numpy as np
import mindspore as ms
import mindspore.context as context
import mindspore.dataset.transforms.c_transforms as C
import mindspore.dataset.transforms.vision.c_transforms as CV
from mindspore.dataset.transforms.vision import Inter
from mindspore import nn, Tensor
from mindspore.train import Model
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor
from mindspore.train.serialization import load_checkpoint, load_param_into_net
context.set_context(mode=context.GRAPH_MODE, device_target='Ascend')
DATA_DIR_TRAIN = "MNIST/train" # 训练集信息
DATA_DIR_TEST = "MNIST/test" # 测试集信息
def create_dataset(training=True, num_epoch=1, batch_size=32, resize=(32, 32),
                   rescale=1/(255*0.3081), shift=-0.1307/0.3081, buffer_size=64):
    """Build the MNIST input pipeline.

    Args:
        training (bool): Read from DATA_DIR_TRAIN when True, otherwise from
            DATA_DIR_TEST.
        num_epoch (int): Number of times the batched dataset is repeated.
        batch_size (int): Samples per batch; incomplete trailing batches are
            dropped.
        resize (tuple): Target size handed to the Resize operation.
        rescale (float): Scale factor handed to the Rescale operation.
        shift (float): Offset handed to the Rescale operation.
        buffer_size (int): Size of the shuffle buffer.

    Returns:
        The dataset after image mapping, label casting, shuffling, batching
        and repeating.
    """
    # NOTE(review): the default rescale/shift look like a fused
    # "divide by 255, then normalize with mean 0.1307 / std 0.3081",
    # assuming Rescale computes x * rescale + shift -- confirm.
    source_dir = DATA_DIR_TRAIN if training else DATA_DIR_TEST
    dataset = ms.dataset.MnistDataset(source_dir)

    # Image column: resize -> rescale -> HWC to CHW layout.
    image_ops = [CV.Resize(resize), CV.Rescale(rescale, shift), CV.HWC2CHW()]
    dataset = dataset.map(input_columns="image", operations=image_ops)

    # Label column: cast to int32.
    dataset = dataset.map(input_columns="label", operations=C.TypeCast(ms.int32))

    dataset = dataset.shuffle(buffer_size=buffer_size)
    dataset = dataset.batch(batch_size, drop_remainder=True)
    return dataset.repeat(num_epoch)
class LeNet(nn.Cell):
    """LeNet-5 style convolutional network with 10 output logits.

    Layout: conv(1->6, 5x5) -> ReLU -> max-pool -> conv(6->16, 5x5) -> ReLU
    -> max-pool -> flatten -> Dense(400->120) -> ReLU -> Dense(120->84)
    -> ReLU -> Dense(84->10).

    The first Dense layer expects 400 = 16 * 5 * 5 features, which is what two
    'valid' 5x5 convolutions and two 2x2 poolings produce from a 1x32x32 input.
    """

    def __init__(self):
        super(LeNet, self).__init__()
        self.relu = nn.ReLU()
        self.conv1 = nn.Conv2d(1, 6, 5, stride=1, pad_mode='valid')
        self.conv2 = nn.Conv2d(6, 16, 5, stride=1, pad_mode='valid')
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Dense(400, 120)
        self.fc2 = nn.Dense(120, 84)
        self.fc3 = nn.Dense(84, 10)

    def construct(self, input_x):
        """Run the forward pass and return the raw class logits."""
        output = self.conv1(input_x)
        output = self.relu(output)
        output = self.pool(output)
        output = self.conv2(output)
        output = self.relu(output)
        output = self.pool(output)
        output = self.flatten(output)
        # A non-linearity is required between the fully connected layers;
        # without it fc1 -> fc2 -> fc3 is equivalent to one linear layer.
        # ReLU has no parameters, so checkpoint parameter names are unchanged.
        output = self.fc1(output)
        output = self.relu(output)
        output = self.fc2(output)
        output = self.relu(output)
        output = self.fc3(output)
        return output
LOOP_SINK = context.get_context('enable_loop_sink')
def test_train(lr=0.01, momentum=0.9, num_epoch=3, ckpt_name="a_lenet"):
    """Train LeNet on MNIST, save checkpoints, then print evaluation metrics.

    Args:
        lr (float): Learning rate passed to the Momentum optimizer.
        momentum (float): Momentum factor passed to the Momentum optimizer.
        num_epoch (int): Number of training epochs (also the dataset repeat
            count).
        ckpt_name (str): Prefix used for the checkpoint files.
    """
    train_set = create_dataset(num_epoch=num_epoch)
    eval_set = create_dataset(training=False)
    steps_per_epoch = train_set.get_dataset_size()

    network = LeNet()
    loss_fn = nn.loss.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction='mean')
    optimizer = nn.Momentum(network.trainable_params(), lr, momentum)

    callbacks = [
        # Save a checkpoint every `steps_per_epoch` steps, keeping at most 5.
        ModelCheckpoint(prefix=ckpt_name,
                        config=CheckpointConfig(save_checkpoint_steps=steps_per_epoch,
                                                keep_checkpoint_max=5)),
        # Print interval depends on whether loop sink is enabled.
        LossMonitor(per_print_times=1 if LOOP_SINK else steps_per_epoch),
    ]

    model = Model(network, loss_fn, optimizer, metrics={'acc', 'loss'})
    model.train(num_epoch, train_set, callbacks=callbacks, dataset_sink_mode=True)
    print('Metrics:', model.eval(eval_set))
if __name__ == "__main__":
    import argparse

    # data_url / train_url are the data and output locations supplied on the
    # command line; unknown extra arguments are tolerated.
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_url', required=True, default=None, help='Location of data.')
    parser.add_argument('--train_url', required=True, default=None, help='Location of training outputs.')
    # The default mirrors test_train's own default (3 epochs), so a run without
    # the flag behaves as before, while an explicit --num_epochs is now honored
    # instead of being silently ignored.
    parser.add_argument('--num_epochs', type=int, default=3, help='Number of training epochs.')
    args, unknown = parser.parse_known_args()

    # Copy the dataset from data_url into the local MNIST/ directory.
    import moxing as mox
    mox.file.copy_parallel(src_url=args.data_url, dst_url='MNIST/')

    os.system('rm -f *.ckpt *.ir *.meta')  # remove leftover files from previous runs
    test_train(num_epoch=args.num_epochs)
此差异已折叠。
# Save and load model
import os
# os.environ['DEVICE_ID'] = '0'
# Log level includes 3(ERROR), 2(WARNING), 1(INFO), 0(DEBUG).
os.environ['GLOG_v'] = '2'
import matplotlib.pyplot as plt
import numpy as np
import mindspore as ms
import mindspore.context as context
import mindspore.dataset.transforms.c_transforms as C
import mindspore.dataset.transforms.vision.c_transforms as CV
from mindspore.dataset.transforms.vision import Inter
from mindspore import nn, Tensor
from mindspore.train import Model
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor
from mindspore.train.serialization import load_checkpoint, load_param_into_net
import logging; logging.getLogger('matplotlib.font_manager').disabled = True
context.set_context(mode=context.GRAPH_MODE, device_target='Ascend')
DATA_DIR_TRAIN = "MNIST/train" # 训练集信息
DATA_DIR_TEST = "MNIST/test" # 测试集信息
def create_dataset(training=True, num_epoch=1, batch_size=32, resize=(32, 32),
                   rescale=1/(255*0.3081), shift=-0.1307/0.3081, buffer_size=64):
    """Assemble the MNIST data pipeline used for training or evaluation.

    Args:
        training (bool): Use DATA_DIR_TRAIN when True, DATA_DIR_TEST otherwise.
        num_epoch (int): How many times the batched dataset is repeated.
        batch_size (int): Batch size; a trailing partial batch is dropped.
        resize (tuple): Size argument for the Resize operation.
        rescale (float): Scale argument for the Rescale operation.
        shift (float): Shift argument for the Rescale operation.
        buffer_size (int): Shuffle buffer size.

    Returns:
        The mapped, shuffled, batched and repeated dataset.
    """
    if training:
        mnist = ms.dataset.MnistDataset(DATA_DIR_TRAIN)
    else:
        mnist = ms.dataset.MnistDataset(DATA_DIR_TEST)

    # Operations applied to the "image" column, in order.
    to_size = CV.Resize(resize)
    to_range = CV.Rescale(rescale, shift)
    to_chw = CV.HWC2CHW()
    mnist = mnist.map(input_columns="image", operations=[to_size, to_range, to_chw])

    # Labels are cast to int32.
    mnist = mnist.map(input_columns="label", operations=C.TypeCast(ms.int32))

    mnist = mnist.shuffle(buffer_size=buffer_size)
    mnist = mnist.batch(batch_size, drop_remainder=True)
    mnist = mnist.repeat(num_epoch)
    return mnist
class LeNet(nn.Cell):
    """LeNet-5 style convolutional network producing 10 class logits.

    Two conv/ReLU/max-pool stages are followed by three fully connected
    layers with ReLU between them. fc1 takes 400 = 16 * 5 * 5 features, the
    result of two 'valid' 5x5 convolutions and two 2x2 poolings on a 1x32x32
    input.
    """

    def __init__(self):
        super(LeNet, self).__init__()
        self.relu = nn.ReLU()
        self.conv1 = nn.Conv2d(1, 6, 5, stride=1, pad_mode='valid')
        self.conv2 = nn.Conv2d(6, 16, 5, stride=1, pad_mode='valid')
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Dense(400, 120)
        self.fc2 = nn.Dense(120, 84)
        self.fc3 = nn.Dense(84, 10)

    def construct(self, input_x):
        """Run the forward pass and return the raw class logits."""
        output = self.conv1(input_x)
        output = self.relu(output)
        output = self.pool(output)
        output = self.conv2(output)
        output = self.relu(output)
        output = self.pool(output)
        output = self.flatten(output)
        # Activations between the Dense layers keep them from collapsing into
        # a single linear transform. ReLU adds no parameters, so saved
        # checkpoints keep the same parameter names.
        output = self.fc1(output)
        output = self.relu(output)
        output = self.fc2(output)
        output = self.relu(output)
        output = self.fc3(output)
        return output
LOOP_SINK = context.get_context('enable_loop_sink')
def test_train(lr=0.01, momentum=0.9, num_epoch=2, check_point_name="b_lenet"):
    """Train LeNet from scratch, writing checkpoints, and print eval metrics.

    Args:
        lr (float): Learning rate for the Momentum optimizer.
        momentum (float): Momentum factor for the Momentum optimizer.
        num_epoch (int): Number of training epochs (also the dataset repeat
            count).
        check_point_name (str): Prefix of the checkpoint files.
    """
    train_set = create_dataset(num_epoch=num_epoch)
    eval_set = create_dataset(training=False)
    steps_per_epoch = train_set.get_dataset_size()

    network = LeNet()
    loss_fn = nn.loss.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction='mean')
    optimizer = nn.Momentum(network.trainable_params(), lr, momentum)

    # Checkpoint every `steps_per_epoch` steps and keep at most five files.
    save_policy = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=5)
    callbacks = [
        ModelCheckpoint(prefix=check_point_name, config=save_policy),
        LossMonitor(per_print_times=1 if LOOP_SINK else steps_per_epoch),
    ]

    model = Model(network, loss_fn, optimizer, metrics={'acc', 'loss'})
    model.train(num_epoch, train_set, callbacks=callbacks, dataset_sink_mode=True)
    print('Metrics:', model.eval(eval_set))
CKPT = 'b_lenet-2_1875.ckpt'
def resume_train(lr=0.001, momentum=0.9, num_epoch=2, ckpt_name="b_lenet"):
    """Continue training LeNet from the checkpoint file named by CKPT.

    The parameters read from CKPT are loaded into both the network and the
    optimizer before training resumes.

    Args:
        lr (float): Learning rate for the Momentum optimizer.
        momentum (float): Momentum factor for the Momentum optimizer.
        num_epoch (int): Number of further training epochs (also the dataset
            repeat count).
        ckpt_name (str): Prefix of the checkpoint files written by this run.
    """
    train_set = create_dataset(num_epoch=num_epoch)
    eval_set = create_dataset(training=False)
    steps_per_epoch = train_set.get_dataset_size()

    network = LeNet()
    loss_fn = nn.loss.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction='mean')
    optimizer = nn.Momentum(network.trainable_params(), lr, momentum)

    # Restore the saved state into the network first, then the optimizer.
    restored_params = load_checkpoint(CKPT)
    for target in (network, optimizer):
        load_param_into_net(target, restored_params)

    save_policy = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=5)
    callbacks = [
        ModelCheckpoint(prefix=ckpt_name, config=save_policy),
        LossMonitor(per_print_times=1 if LOOP_SINK else steps_per_epoch),
    ]

    model = Model(network, loss_fn, optimizer, metrics={'acc', 'loss'})
    model.train(num_epoch, train_set, callbacks=callbacks)
    print('Metrics:', model.eval(eval_set))
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--data_url', required=True, default=None, help='Location of data.')
    parser.add_argument('--train_url', required=True, default=None, help='Location of training outputs.')
    # NOTE(review): --num_epochs is parsed but not used below; CKPT is tied to
    # the 2-epoch default of test_train.
    parser.add_argument('--num_epochs', type=int, default=1, help='Number of training epochs.')
    args, unknown = parser.parse_known_args()

    # Copy the dataset from data_url into the local MNIST/ directory.
    import moxing as mox
    mox.file.copy_parallel(src_url=args.data_url, dst_url='MNIST/')

    os.system('rm -f *.ckpt *.ir *.meta')  # remove leftover files from previous runs

    # Train from scratch, then resume from the saved checkpoint; after each
    # stage list the b_lenet* files present in the working directory.
    for run_stage in (test_train, resume_train):
        run_stage()
        print('\n'.join(sorted(name for name in os.listdir('.') if name.startswith('b_lenet'))))
\ No newline at end of file
<h1 style="text-align:center">计算机视觉应用</h1>
[TOC]
## 实验介绍
本实验主要介绍使用MindSpore在CIFAR10数据集上训练ResNet50。本实验建议使用MindSpore model_zoo中提供的ResNet50。
## 实验目的
- 了解如何使用MindSpore加载常用的CIFAR-10图片分类数据集。
- 了解MindSpore的model_zoo模块,以及如何使用model_zoo中的模型。
- 了解ResNet50这类大模型的基本结构和编程方法。
## 预备知识
- 熟练使用Python,了解Shell及Linux操作系统基本知识。
- 具备一定的深度学习理论知识,如卷积神经网络、损失函数、优化器,训练策略、Checkpoint等。
- 了解华为云的基本使用方法,包括[OBS(对象存储)](https://www.huaweicloud.com/product/obs.html)、[ModelArts(AI开发平台)](https://www.huaweicloud.com/product/modelarts.html)、[训练作业](https://support.huaweicloud.com/engineers-modelarts/modelarts_23_0046.html)等功能。华为云官网:https://www.huaweicloud.com
- 了解并熟悉MindSpore AI计算框架,MindSpore官网:https://www.mindspore.cn/
## 实验环境
- MindSpore 0.2.0(MindSpore版本会定期更新,本指导也会定期刷新,与版本配套);
- 华为云ModelArts:ModelArts是华为云提供的面向开发者的一站式AI开发平台,集成了昇腾AI处理器资源池,用户可以在该平台下体验MindSpore。ModelArts官网:https://www.huaweicloud.com/product/modelarts.html
## 实验准备
### 创建OBS桶
本实验需要使用华为云OBS存储脚本和数据集,可以参考[快速通过OBS控制台上传下载文件](https://support.huaweicloud.com/qs-obs/obs_qs_0001.html)了解使用OBS创建桶、上传文件、下载文件的使用方法。
> **提示:**华为云新用户使用OBS时通常需要创建和配置“访问密钥”,可以在使用OBS时根据提示完成创建和配置。也可以参考[获取访问密钥并完成ModelArts全局配置](https://support.huaweicloud.com/prepare-modelarts/modelarts_08_0002.html)获取并配置访问密钥。
创建OBS桶的参考配置如下:
- 区域:华北-北京四
- 数据冗余存储策略:单AZ存储
- 桶名称:如ms-course
- 存储类别:标准存储
- 桶策略:公共读
- 归档数据直读:关闭
- 企业项目、标签等配置:免
### 数据集准备
CIFAR-10是一个图片分类数据集,包含60000张32x32的彩色物体图片,训练集50000张,测试集10000张,共10类,每类6000张。CIFAR-10数据集的官网:[The CIFAR-10 dataset](http://www.cs.toronto.edu/~kriz/cifar.html)
从CIFAR-10官网下载“CIFAR-10 binary version (suitable for C programs)”到本地并解压。
### 脚本准备
从[MindSpore tutorial仓库](https://gitee.com/mindspore/docs/tree/r0.2/tutorials/tutorial_code/sample_for_cloud/)里下载相关脚本。
### 上传文件
将脚本和数据集上传到OBS桶中,组织为如下形式:
```
experiment_3
├── 脚本等文件
└── cifar10
├── batches.meta.txt
├── eval
│   └── test_batch.bin
└── train
├── data_batch_1.bin
├── data_batch_2.bin
├── data_batch_3.bin
├── data_batch_4.bin
└── data_batch_5.bin
```
## 实验步骤
参考MindSpore官网[计算机视觉应用](https://www.mindspore.cn/tutorial/zh-CN/0.1.0-alpha/advanced_use/computer_vision_application.html)教程,使用MindSpore在CIFAR10数据集上训练ResNet50,并进行验证。建议:
- 使用单卡训练即可;
- 理解并熟悉教程中涉及的源码;
- 使用MindSpore model_zoo中提供的ResNet50。
### 代码梳理
- resnet50_train.py:主脚本,包含性能测试`PerformanceCallback`、动态学习率`get_lr`、执行函数`resnet50_train`等函数;
- dataset.py:数据处理脚本。
`PerformanceCallback`继承MindSpore Callback类,并统计每个训练step的时延:
```python
class PerformanceCallback(Callback):
"""
Training performance callback.
Args:
batch_size (int): Batch number for one step.
"""
def __init__(self, batch_size):
super(PerformanceCallback, self).__init__()
self.batch_size = batch_size
self.last_step = 0
self.epoch_begin_time = 0
def step_begin(self, run_context):
self.epoch_begin_time = time.time()
def step_end(self, run_context):
params = run_context.original_args()
cost_time = time.time() - self.epoch_begin_time
train_steps = params.cur_step_num -self.last_step
print(f'epoch {params.cur_epoch_num} cost time = {cost_time}, train step num: {train_steps}, '
f'one step time: {1000*cost_time/train_steps} ms, '
f'train samples per second of cluster: {device_num*train_steps*self.batch_size/cost_time:.1f}\n')
self.last_step = run_context.original_args().cur_step_num
```
`get_lr`生成学习率数组,其中每个元素对应每个step的学习率,这里学习率下降采用二次曲线的形式:
```python
def get_lr(global_step,
total_epochs,
steps_per_epoch,
lr_init=0.01,
lr_max=0.1,
warmup_epochs=5):
"""
Generate learning rate array.
Args:
global_step (int): Initial step of training.
total_epochs (int): Total epoch of training.
steps_per_epoch (float): Steps of one epoch.
lr_init (float): Initial learning rate. Default: 0.01.
lr_max (float): Maximum learning rate. Default: 0.1.
warmup_epochs (int): The number of warming up epochs. Default: 5.
Returns:
np.array, learning rate array.
"""
lr_each_step = []
total_steps = steps_per_epoch * total_epochs
warmup_steps = steps_per_epoch * warmup_epochs
if warmup_steps != 0:
inc_each_step = (float(lr_max) - float(lr_init)) / float(warmup_steps)
else:
inc_each_step = 0
for i in range(int(total_steps)):
if i < warmup_steps:
lr = float(lr_init) + inc_each_step * float(i)
else:
base = ( 1.0 - (float(i) - float(warmup_steps)) / (float(total_steps) - float(warmup_steps)) )
lr = float(lr_max) * base * base
if lr < 0.0:
lr = 0.0
lr_each_step.append(lr)
current_step = global_step
lr_each_step = np.array(lr_each_step).astype(np.float32)
learning_rate = lr_each_step[current_step:]
return learning_rate
```
MindSpore支持直接读取cifar10数据集:
```python
if device_num == 1 or not do_train:
ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=do_shuffle)
else:
ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=do_shuffle,
num_shards=device_num, shard_id=device_id)
```
导入并使用model_zoo里的resnet50模型:
```python
from mindspore.model_zoo.resnet import resnet50
# create model
net = resnet50(class_num = class_num)
```
使用数据增强,如随机裁剪、随机水平反转:
```python
# define map operations
random_crop_op = C.RandomCrop((32, 32), (4, 4, 4, 4))
random_horizontal_flip_op = C.RandomHorizontalFlip(device_id / (device_id + 1))
```
`model_zoo.resnet`中resnet50定义如下:
```python
def resnet50(class_num=10):
return ResNet(ResidualBlock,
[3, 4, 6, 3],
[64, 256, 512, 1024],
[256, 512, 1024, 2048],
[1, 2, 2, 2],
class_num)
```
ResNet类定义如下:
```python
class ResNet(nn.Cell):
"""
ResNet architecture.
Args:
block (Cell): Block for network.
layer_nums (list): Numbers of block in different layers.
in_channels (list): Input channel in each layer.
out_channels (list): Output channel in each layer.
strides (list): Stride size in each layer.
num_classes (int): The number of classes that the training images are belonging to.
Returns:
Tensor, output tensor.
Examples:
>>> ResNet(ResidualBlock,
>>> [3, 4, 6, 3],
>>> [64, 256, 512, 1024],
>>> [256, 512, 1024, 2048],
>>> [1, 2, 2, 2],
>>> 10)
"""
```
ResNet的不同版本均由5个阶段(stage)组成,其中ResNet50结构为Convx1 -> ResidualBlockx3 -> ResidualBlockx4 -> ResidualBlockx6 -> ResidualBlockx3 -> Pooling+FC。
`ResidualBlock`为残差模块,相比传统卷积多了一个short-cut支路,用于将浅层的信息直接传递到深层,使得网络可以很深,而不会出现训练时梯度消失/爆炸的问题:
```python
class ResidualBlock(nn.Cell):
expansion = 4
def __init__(self,
in_channel,
out_channel,
stride=1):
super(ResidualBlock, self).__init__()
channel = out_channel // self.expansion
self.conv1 = _conv1x1(in_channel, channel, stride=1)
self.bn1 = _bn(channel)
self.conv2 = _conv3x3(channel, channel, stride=stride)
self.bn2 = _bn(channel)
self.conv3 = _conv1x1(channel, out_channel, stride=1)
self.bn3 = _bn_last(out_channel)
self.relu = nn.ReLU()
# 如果stride不为1或输入、输出通道数不一致,short-cut支路需要经过1x1卷积和BN进行变换
self.down_sample = False
if stride != 1 or in_channel != out_channel:
self.down_sample = True
self.down_sample_layer = None
if self.down_sample:
self.down_sample_layer = nn.SequentialCell([_conv1x1(in_channel, out_channel, stride),
_bn(out_channel)])
self.add = P.TensorAdd()
def construct(self, x):
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.down_sample:
identity = self.down_sample_layer(identity)
# out为残差支路,identity为short-cut支路
out = self.add(out, identity)
out = self.relu(out)
return out
```
创建训练作业时,运行参数会通过脚本传参的方式输入给脚本代码,脚本必须解析传参才能在代码中使用相应参数。如data_url和train_url,分别对应数据存储路径(OBS路径)和训练输出路径(OBS路径)。脚本对传参进行解析后赋值到`args`变量里,在后续代码里可以使用。
```python
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--data_url', required=True, default=None, help='Location of data.')
parser.add_argument('--train_url', required=True, default=None, help='Location of training outputs.')
parser.add_argument('--num_epochs', type=int, default=1, help='Number of training epochs.')
args, unknown = parser.parse_known_args()
```
MindSpore暂时没有提供直接访问OBS数据的接口,需要通过MoXing提供的API与OBS交互。将OBS中存储的数据拷贝至执行容器:
```python
import moxing as mox
mox.file.copy_parallel(src_url=args.data_url, dst_url='cifar10/')
```
如需将训练输出(如模型Checkpoint)从执行容器拷贝至OBS,请参考:
```python
import moxing as mox
mox.file.copy_parallel(src_url='output', dst_url='s3://OBS/PATH')
```
### 创建训练作业
可以参考[使用常用框架训练模型](https://support.huaweicloud.com/engineers-modelarts/modelarts_23_0238.html)来创建并启动训练作业。
创建训练作业的参考配置:
- 算法来源:常用框架->Ascend-Powered-Engine->MindSpore
- 代码目录:选择上述新建的OBS桶中的experiment_3目录
- 启动文件:选择上述新建的OBS桶中的experiment_3目录下的`resnet50_train.py`
- 数据来源:数据存储位置->选择上述新建的OBS桶中的experiment_3文件夹下的cifar10目录
- 训练输出位置:选择上述新建的OBS桶中的experiment_3目录并在其中创建output目录
- 作业日志路径:同训练输出位置
- 规格:Ascend:1*Ascend 910
- 其他均为默认
启动并查看训练过程:
1. 点击提交以开始训练;
2. 在训练作业列表里可以看到刚创建的训练作业,在训练作业页面可以看到版本管理;
3. 点击运行中的训练作业,在展开的窗口中可以查看作业配置信息,以及训练过程中的日志,日志会不断刷新,等训练作业完成后也可以下载日志到本地进行查看;
4. 在训练日志中可以看到`epoch 90 cost time = 27.963477849960327, train step num: 1562, one step time: 17.90235457743939 ms, train samples per second of cluster: 1787.5`等字段,即训练过程的性能数据;
5. 在训练日志中可以看到`epoch: 90 step: 1562, loss is 0.00250402`等字段,即训练过程的loss数据;
6. 在训练日志里可以看到`Evaluation result: {'acc': 0.9182692307692307}.`字段,即训练完成后的验证精度。
## 实验结论
本实验主要介绍使用MindSpore在CIFAR10数据集上训练ResNet50,了解了以下知识点:
- 性能测试
- 动态学习率
- model_zoo:resnet50
- cifar10数据集、数据增强
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Create train or eval dataset."""
import os
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset.transforms.vision.c_transforms as C
import mindspore.dataset.transforms.c_transforms as C2
# Device index and device count are read from the DEVICE_ID and RANK_SIZE
# environment variables. When a variable is not set, fall back to a
# single-device configuration (device 0 of 1) instead of failing at import
# time with int(None).
device_id = int(os.getenv('DEVICE_ID', '0'))
device_num = int(os.getenv('RANK_SIZE', '1'))
def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32):
    """
    Create a train or eval dataset.

    Training data is read from ``<dataset_path>/train`` with shuffling and
    augmentation (random crop and random horizontal flip); evaluation data is
    read from ``<dataset_path>/eval`` in order, without augmentation. Both are
    resized to 224x224, rescaled by 1/255, normalized, converted from HWC to
    CHW, batched and repeated.

    Args:
        dataset_path (str): The path of dataset.
        do_train (bool): Whether dataset is used for train or eval.
        repeat_num (int): The repeat times of dataset. Default: 1.
        batch_size (int): The batch size of dataset. Default: 32.

    Returns:
        Dataset.
    """
    if do_train:
        dataset_path = os.path.join(dataset_path, 'train')
        do_shuffle = True
    else:
        dataset_path = os.path.join(dataset_path, 'eval')
        do_shuffle = False

    # Shard the data across devices only for multi-device training.
    if device_num == 1 or not do_train:
        ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=do_shuffle)
    else:
        ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=do_shuffle,
                               num_shards=device_num, shard_id=device_id)

    resize_height = 224
    resize_width = 224
    rescale = 1.0 / 255.0
    shift = 0.0

    # define map operations
    random_crop_op = C.RandomCrop((32, 32), (4, 4, 4, 4))
    # Use a fixed flip probability. The previous device_id / (device_id + 1)
    # evaluated to 0.0 on device 0, which disabled the flip entirely for
    # single-device runs and gave every device a different probability.
    random_horizontal_flip_op = C.RandomHorizontalFlip(0.5)
    resize_op = C.Resize((resize_height, resize_width))
    rescale_op = C.Rescale(rescale, shift)
    normalize_op = C.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])
    change_swap_op = C.HWC2CHW()

    # Augmentation is applied to training data only.
    trans = []
    if do_train:
        trans += [random_crop_op, random_horizontal_flip_op]
    trans += [resize_op, rescale_op, normalize_op, change_swap_op]

    type_cast_op = C2.TypeCast(mstype.int32)

    ds = ds.map(input_columns="label", num_parallel_workers=8, operations=type_cast_op)
    ds = ds.map(input_columns="image", num_parallel_workers=8, operations=trans)

    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)

    # apply dataset repeat operation
    ds = ds.repeat(repeat_num)

    return ds
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""ResNet50 model train with MindSpore"""
import os
import argparse
import random
import time
import numpy as np
import moxing as mox
from mindspore import context
from mindspore import Tensor
from mindspore.nn.optim.momentum import Momentum
from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
from mindspore.train.model import Model, ParallelMode
from mindspore.train.callback import Callback, LossMonitor
from mindspore.train.loss_scale_manager import FixedLossScaleManager
import mindspore.dataset.engine as de
from dataset import create_dataset, device_id, device_num
from mindspore.model_zoo.resnet import resnet50
random.seed(1)
np.random.seed(1)
de.config.set_seed(1)
class PerformanceCallback(Callback):
    """
    Print timing and throughput figures whenever ``step_end`` is invoked.

    Args:
        batch_size (int): Batch number for one step.
    """

    def __init__(self, batch_size):
        super().__init__()
        self.batch_size = batch_size
        # Step counter value seen at the previous step_end call.
        self.last_step = 0
        # Timestamp recorded by the most recent step_begin call.
        self.epoch_begin_time = 0

    def step_begin(self, run_context):
        """Record the wall-clock time at which the interval starts."""
        self.epoch_begin_time = time.time()

    def step_end(self, run_context):
        """Print elapsed time, per-step time and samples per second."""
        params = run_context.original_args()
        cost_time = time.time() - self.epoch_begin_time
        # Number of steps executed since the previous report.
        train_steps = params.cur_step_num - self.last_step
        report = (
            f'epoch {params.cur_epoch_num} cost time = {cost_time}, train step num: {train_steps}, '
            f'one step time: {1000*cost_time/train_steps} ms, '
            f'train samples per second of cluster: {device_num*train_steps*self.batch_size/cost_time:.1f}\n'
        )
        print(report)
        self.last_step = params.cur_step_num
def get_lr(global_step,
           total_epochs,
           steps_per_epoch,
           lr_init=0.01,
           lr_max=0.1,
           warmup_epochs=5):
    """
    Build the per-step learning rate schedule.

    The rate rises linearly from ``lr_init`` towards ``lr_max`` during the
    warm-up steps, then follows ``lr_max * base**2`` where ``base`` falls
    linearly from 1 to 0 over the remaining steps.

    Args:
        global_step (int): Initial step of training.
        total_epochs (int): Total epoch of training.
        steps_per_epoch (float): Steps of one epoch.
        lr_init (float): Initial learning rate. Default: 0.01.
        lr_max (float): Maximum learning rate. Default: 0.1.
        warmup_epochs (int): The number of warming up epochs. Default: 5.

    Returns:
        np.array, learning rate array (float32) starting at ``global_step``.
    """
    total_steps = steps_per_epoch * total_epochs
    warmup_steps = steps_per_epoch * warmup_epochs

    # Per-step increment during warm-up; zero when there is no warm-up phase.
    if warmup_steps != 0:
        warmup_slope = (float(lr_max) - float(lr_init)) / float(warmup_steps)
    else:
        warmup_slope = 0

    def _rate_at(step):
        """Return the learning rate for a single step index."""
        if step < warmup_steps:
            return float(lr_init) + warmup_slope * float(step)
        remaining = 1.0 - (float(step) - float(warmup_steps)) / (float(total_steps) - float(warmup_steps))
        rate = float(lr_max) * remaining * remaining
        if rate < 0.0:
            rate = 0.0
        return rate

    schedule = np.array([_rate_at(step) for step in range(int(total_steps))]).astype(np.float32)
    # Drop the steps that were already consumed before `global_step`.
    return schedule[global_step:]
def resnet50_train(args_opt):
    """Train ResNet50 on the dataset at ``args_opt.data_url`` and evaluate it.

    Args:
        args_opt: Parsed command-line arguments; ``epoch_size`` and
            ``data_url`` are read from it.
    """
    epoch_size = args_opt.epoch_size
    batch_size, class_num, loss_scale_num = 32, 10, 1024
    local_data_path = '/cache/data'

    # set graph mode and parallel mode
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False)
    context.set_context(enable_task_sink=True, device_id=device_id)
    context.set_context(enable_loop_sink=True)
    context.set_context(enable_mem_reuse=True)

    if device_num > 1:
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          mirror_mean=True)
        # Each device copies the data into its own sub-directory.
        local_data_path = os.path.join(local_data_path, str(device_id))

    # data download
    print('Download data.')
    mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_path)

    # create dataset
    print('Create train and evaluate dataset.')
    train_ds = create_dataset(dataset_path=local_data_path, do_train=True,
                              repeat_num=epoch_size, batch_size=batch_size)
    eval_ds = create_dataset(dataset_path=local_data_path, do_train=False,
                             repeat_num=1, batch_size=batch_size)
    steps_per_epoch = train_ds.get_dataset_size()
    print('Create dataset success.')

    # create model
    network = resnet50(class_num=class_num)
    loss_fn = SoftmaxCrossEntropyWithLogits(sparse=True)
    lr_schedule = Tensor(get_lr(global_step=0, total_epochs=epoch_size,
                                steps_per_epoch=steps_per_epoch))
    optimizer = Momentum(network.trainable_params(), lr_schedule, momentum=0.9,
                         weight_decay=1e-4, loss_scale=loss_scale_num)
    # The same loss scale value is given to the optimizer and the manager.
    scale_manager = FixedLossScaleManager(loss_scale_num, False)
    model = Model(network, loss_fn=loss_fn, optimizer=optimizer,
                  loss_scale_manager=scale_manager, metrics={'acc'})

    # performance callback reports throughput, loss callback reports the loss
    callbacks = [PerformanceCallback(batch_size), LossMonitor()]

    print(f'Start run training, total epoch: {epoch_size}.')
    model.train(epoch_size, train_ds, callbacks=callbacks)

    # Evaluation runs on single-device jobs, or on device 0 otherwise.
    if device_num == 1 or device_id == 0:
        print('Start run evaluation.')
        output = model.eval(eval_ds)
        print(f'Evaluation result: {output}.')
def _main():
    """Parse the command line and run ResNet50 training."""
    cli = argparse.ArgumentParser(description='ResNet50 train.')
    cli.add_argument('--data_url', required=True, default=None, help='Location of data.')
    cli.add_argument('--train_url', required=True, default=None, help='Location of training outputs.')
    cli.add_argument('--epoch_size', type=int, default=90, help='Train epoch size.')
    # Unrecognized arguments are accepted and ignored.
    parsed, _ = cli.parse_known_args()

    resnet50_train(parsed)
    print('ResNet50 training success!')


if __name__ == '__main__':
    _main()
<h1 style="text-align:center">自然语言处理应用</h1>
[TOC]
## 实验介绍
本实验主要介绍使用MindSpore开发和训练[BERT](https://arxiv.org/pdf/1810.04805.pdf)模型。建议先了解MindSpore官网上model_zoo上的BERT模型。
## 实验目的
- 了解如何使用MindSpore加载常用的NLP数据集。
- 了解MindSpore的model_zoo模块,以及如何使用model_zoo中的模型。
- 了解BERT模型的基本结构和编程方法。
## 预备知识
- 熟练使用Python,了解Shell及Linux操作系统基本知识。
- 具备一定的深度学习理论知识,如Embedding、Encoder、Decoder、损失函数、优化器,训练策略、Checkpoint等。
- 了解华为云的基本使用方法,包括[OBS(对象存储)](https://www.huaweicloud.com/product/obs.html)、[ModelArts(AI开发平台)](https://www.huaweicloud.com/product/modelarts.html)、[训练作业](https://support.huaweicloud.com/engineers-modelarts/modelarts_23_0046.html)等功能。华为云官网:https://www.huaweicloud.com
- 了解并熟悉MindSpore AI计算框架,MindSpore官网:https://www.mindspore.cn/
## 实验环境
- MindSpore 0.2.0(MindSpore版本会定期更新,本指导也会定期刷新,与版本配套);
- 华为云ModelArts:ModelArts是华为云提供的面向开发者的一站式AI开发平台,集成了昇腾AI处理器资源池,用户可以在该平台下体验MindSpore。ModelArts官网:https://www.huaweicloud.com/product/modelarts.html
## 实验准备
### 创建OBS桶
本实验需要使用华为云OBS存储脚本和数据集,可以参考[快速通过OBS控制台上传下载文件](https://support.huaweicloud.com/qs-obs/obs_qs_0001.html)了解使用OBS创建桶、上传文件、下载文件的使用方法。
> **提示:**华为云新用户使用OBS时通常需要创建和配置“访问密钥”,可以在使用OBS时根据提示完成创建和配置。也可以参考[获取访问密钥并完成ModelArts全局配置](https://support.huaweicloud.com/prepare-modelarts/modelarts_08_0002.html)获取并配置访问密钥。
创建OBS桶的参考配置如下:
- 区域:华北-北京四
- 数据冗余存储策略:单AZ存储
- 桶名称:如ms-course
- 存储类别:标准存储
- 桶策略:公共读
- 归档数据直读:关闭
- 企业项目、标签等配置:免
### 数据集准备
**预训练(pretrain)数据集**:下载[zhwiki数据集](https://dumps.wikimedia.org/zhwiki),使用[WikiExtractor](https://github.com/attardi/wikiextractor)进行预处理,然后使用[google-research/bert:create_pretraining_data.py](https://github.com/google-research/bert/blob/master/create_pretraining_data.py)将数据转为TFRecord格式;
zhwiki为中文维基百科数据集,需要将其处理为具有上下文关系的句子对,然后基于词典vocab.txt对每个句子对进行token化,然后存储为特定数据格式(如Json、TFRecord、MindRecord)。
**微调(finetune)数据集**:使用[CLUEbenchmark/CLUEPretrainedModels中的脚本](https://github.com/CLUEbenchmark/CLUEPretrainedModels/blob/master/baselines/models/bert/run_classifier_tnews.sh)下载、处理TNEWS数据集,并将数据转为TFRecord格式。
TNEWS为今日头条中文新闻(短文本)分类(Short Text Classification for News)数据集。该数据集来自今日头条的新闻版块,共提取了15个类别的新闻,包括旅游,教育,金融,军事等。数据量:训练集(53,360),验证集(10,000),测试集(10,000)。例子:
{"label": "102", "label_des": "news_entertainment", "sentence": "江疏影甜甜圈自拍,迷之角度竟这么好看,美吸引一切事物"}
每一条数据有三个属性,从前往后分别是 分类ID,分类名称,新闻字符串(仅含标题)。
本实验不进行数据预处理,请从网盘下载zhwiki_part和tnews数据集:
链接: https://pan.baidu.com/s/1F2S9Wr-ND0LMfATjv7WEug 提取码: gent
### 脚本准备
从[课程gitee仓库](https://gitee.com/mindspore/course)上下载本实验相关脚本。其中`tokenization.py`来源于[google-research/bert](https://github.com/google-research/bert/blob/master/tokenization.py)
### 上传文件
将脚本和数据集上传到OBS桶中,组织为如下形式:
```
experiment_4
├── 脚本等文件
├── tnews
│   ├── bert_base.ckpt
│   ├── dev.tf_record
│   ├── dev_schema.json
│   ├── label2id.json
│   ├── train.tf_record
│   ├── train_schema.json
│   └── vocab.txt
└── zhwiki_part
├── schema.json
└── part.tfrecord
```
## 实验步骤
参考MindSpore开源仓库[BERT example](https://gitee.com/mindspore/mindspore/tree/r0.2/example/Bert_NEZHA_cnwiki)示例,并进行实验。
BERT(Bidirectional Encoder Representations from Transformers),即基于Transformer的双向编码表征。其中:
- Transformer是一种注意力(Attention)机制,用来学习文本中单词上下文之间的关系;
- 双向是指通过Masked Language Model(MLM)方法,随机的掩盖掉句子中的某些单词,然后利用前后未掩盖的信息来预测掩盖的单词;
更多BERT的介绍可以参考[Link](https://www.jianshu.com/p/d110d0c13063)
### 预训练BERT模型
[BERT](https://github.com/google-research/bert)模型包含由不同隐含层数(number hidden layers)和隐含层单元数(hidden size)构成的不同版本。通常情况下使用Bert需要预训练(pretrain)和微调(fine-tune)两个阶段。预训练BERT模型通常需要在大数据集上多卡并行训练多天。本实验先以部分zhwiki数据集为例展示预训练的过程。
BERT预训练阶段包含两个任务(两个输出):
- Mask语言模型(Mask LM):预测被掩盖掉(mask)的单词;
- NextSentence预测(NSP):判断句子对是否具有上下文关系,即句子B是否是句子A的下一句。
### 代码梳理
model_zoo:Bert_NEZHA中包含两个模块:
- `bert_for_pre_training.py`:包含`GetMaskedLMOutput`, `GetNextSentenceOutput`, `BertPreTraining`, `BertPretrainingLoss`, `BertNetworkWithLoss`, `BertTrainOneStepCell`, `BertTrainOneStepWithLossScaleCell`
- `bert_model.py`:包含`BertModel`及其依赖的`EmbeddingLookup`、`EmbeddingPostprocessor`、`BertTransformer`等模块。
`GetMaskedLMOutput`接在BERT基础模型的后面,用于获取Mask LM的输出,
`GetNextSentenceOutput`在BERT基础模型的后面接了一个全连接层和Softmax层,用于获取NSP的输出。
```python
class GetNextSentenceOutput(nn.Cell):
def construct(self, input_tensor):
logits = self.dense(input_tensor)
logits = self.cast(logits, self.dtype)
log_prob = self.log_softmax(logits)
return log_prob
```
`BertPreTraining`将Mask LM模型和NSP模型封装成一个模型定义,`BertPretrainingLoss`将Mask LM Loss和NSP Loss加和封装为一个Loss定义。`BertNetworkWithLoss`根据模型输出计算Loss值。
```python
class BertNetworkWithLoss(nn.Cell):
"""
Provide bert pre-training loss through network.
Args:
config (BertConfig): The config of BertModel.
is_training (bool): Specifies whether to use the training mode.
use_one_hot_embeddings (bool): Specifies whether to use one-hot for embeddings. Default: False.
Returns:
Tensor, the loss of the network.
"""
def __init__(self, config, is_training, use_one_hot_embeddings=False):
super(BertNetworkWithLoss, self).__init__()
self.bert = BertPreTraining(config, is_training, use_one_hot_embeddings)
self.loss = BertPretrainingLoss(config)
self.cast = P.Cast()
def construct(self,
input_ids,
input_mask,
token_type_id,
next_sentence_labels,
masked_lm_positions,
masked_lm_ids,
masked_lm_weights):
prediction_scores, seq_relationship_score = \
self.bert(input_ids, input_mask, token_type_id, masked_lm_positions)
total_loss = self.loss(prediction_scores, seq_relationship_score,
masked_lm_ids, masked_lm_weights, next_sentence_labels)
return self.cast(total_loss, mstype.float32)
```
`BertTrainOneStepCell`在`BertNetworkWithLoss`上加上了反向传播和梯度更新(优化器),接收数据输入,更新模型权重。`BertTrainOneStepWithLossScaleCell`在此基础上引入了损失缩放(Loss Scaling)。损失缩放是为了应对反向传播过程中梯度数值较小,计算时(如采用FP16)会被当做0处理,所以先对Loss做一个放大,然后再对梯度进行缩小。
`bert_model.py`中的`BertModel`接收数据输入,经过`EmbeddingLookup`, `EmbeddingPostprocessor`, `BertTransformer`和`Dense`计算后得到输出。
![BERT Model](https://www.lyrn.ai/wp-content/uploads/2018/11/transformer.png)
[1] 图片来源于https://www.lyrn.ai
```python
class BertModel(nn.Cell):
def construct(self, input_ids, token_type_ids, input_mask):
# embedding
if not self.token_type_ids_from_dataset:
token_type_ids = self.token_type_ids
word_embeddings, embedding_tables = self.bert_embedding_lookup(input_ids)
embedding_output = self.bert_embedding_postprocessor(token_type_ids,
word_embeddings)
# attention mask [batch_size, seq_length, seq_length]
attention_mask = self._create_attention_mask_from_input_mask(input_mask)
# bert encoder
encoder_output = self.bert_encoder(self.cast_compute_type(embedding_output),
attention_mask)
sequence_output = self.cast(encoder_output[self.last_idx], self.dtype)
# pooler
sequence_slice = self.slice(sequence_output,
(0, 0, 0),
(self.batch_size, 1, self.hidden_size),
(1, 1, 1))
first_token = self.squeeze_1(sequence_slice)
pooled_output = self.dense(first_token)
pooled_output = self.cast(pooled_output, self.dtype)
return sequence_output, pooled_output, embedding_tables
```
`EmbeddingLookup`和`EmbeddingPostprocessor`用于将输入转换成Embedding张量,Embedding如下图所示:
![Embedding](https://www.lyrn.ai/wp-content/uploads/2018/11/NSP.png)
[2] 图片来源于https://www.lyrn.ai 和https://arxiv.org/pdf/1810.04805.pdf
`BertTransformer`采用了下图中[Transformer](https://arxiv.org/pdf/1706.03762.pdf)中的encoder部分(左侧半边),包含`BertAttention->BertSelfAttention->BertEncoderCell`
![Transformer](https://pic2.zhimg.com/80/v2-0e85f4d440e621803d11408b39834dd1_720w.jpg)
[3] 图片来源于https://zhuanlan.zhihu.com/p/34781297 和https://arxiv.org/pdf/1706.03762.pdf
`BertAttention`为Multi-Head Attention:
![Multi-Head Attention](https://pic3.zhimg.com/80/v2-58d60594bc3e9cbe47faec82ef29fd76_720w.jpg)
[4] 图片来源于https://zhuanlan.zhihu.com/p/34781297 和https://arxiv.org/pdf/1706.03762.pdf
创建训练作业时,运行参数会通过脚本传参的方式输入给脚本代码,脚本必须解析传参才能在代码中使用相应参数。如data_url和train_url,分别对应数据存储路径(OBS路径)和训练输出路径(OBS路径)。脚本对传参进行解析后赋值到`args`变量里,在后续代码里可以使用。
```python
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--data_url', required=True, default=None, help='Location of data.')
parser.add_argument('--train_url', required=True, default=None, help='Location of training outputs.')
parser.add_argument('--num_epochs', type=int, default=1, help='Number of training epochs.')
args, unknown = parser.parse_known_args()
```
MindSpore暂时没有提供直接访问OBS数据的接口,需要通过MoXing提供的API与OBS交互。将OBS中存储的数据拷贝至执行容器:
```python
import moxing as mox
mox.file.copy_parallel(src_url=args.data_url, dst_url='zhwiki_part/')
```
将训练模型Checkpoint从执行容器拷贝至OBS:
```python
import moxing as mox
mox.file.copy_parallel(src_url='bert_classfication-3_3335.ckpt',
dst_url=os.path.join(args.data_url, 'bert_classfication-3_3335.ckpt'))
```
#### 创建训练作业
可以参考[使用常用框架训练模型](https://support.huaweicloud.com/engineers-modelarts/modelarts_23_0238.html)来创建并启动训练作业。
创建训练作业的参考配置:
- 算法来源:常用框架->Ascend-Powered-Engine->MindSpore
- 代码目录:选择上述新建的OBS桶中的experiment_4目录
- 启动文件:选择上述新建的OBS桶中的experiment_4目录下的`pretrain.py`
- 数据来源:数据存储位置->选择上述新建的OBS桶中的experiment_4文件夹下的zhwiki_part目录
- 训练输出位置:选择上述新建的OBS桶中的experiment_4目录并在其中创建pretrain_output目录
- 作业日志路径:同训练输出位置
- 规格:Ascend:1*Ascend 910
- 其他均为默认
启动并查看训练过程:
1. 点击提交以开始训练;
2. 在训练作业列表里可以看到刚创建的训练作业,在训练作业页面可以看到版本管理;
3. 点击运行中的训练作业,在展开的窗口中可以查看作业配置信息,以及训练过程中的日志,日志会不断刷新,等训练作业完成后也可以下载日志到本地进行查看;
4. 在训练日志中可以看到`epoch: 10 step: 10, loss is 10.741777`等字段,即预训练过程的loss数据。
### 微调BERT
通常情况下,需要基于预训练的BERT模型在各类细分任务上做微调(finetune),提高BERT在具体任务上的效果。本实验在CLUEbenchmark/CLUE提供的TNEWS数据集上对预训练的BERT做微调,即学习一个短文本分类任务。
预训练和微调两种情况下BERT基础模型是相同的,只是最后会在基础模型上加上不同的任务层,用于解决文本分类(新闻分类、情感分类)、序列标注(命名实体识别、问答)等任务。
微调BERT依赖如下几个模块:
- `finetune.py`:包含Loss打印、数据处理、优化器、模型保存等;
- `finetune_config.py`:模型和训练配置;
- `utils.py`模块中定义了finetune需要的模型,包含`BertFinetuneCell`, `BertCLSModel`, `BertNERModel`, `BertCLS`和`BertNER`。
`BertFinetuneCell`等同于预训练时的`BertTrainOneStepCell`/`BertTrainOneStepWithLossScaleCell`,接收数据输入,更新模型权重。
`BertCLSModel`在BERT基础模型上接了分类任务头:
```python
class BertCLSModel(nn.Cell):
"""
This class is responsible for classification task evaluation, i.e. XNLI(num_labels=3),
LCQMC(num_labels=2), Chnsenti(num_labels=2). The returned output represents the final
logits as the results of log_softmax is propotional to that of softmax.
"""
```
`BertNERModel`在BERT基础模型上接了命名实体识别(NER)任务头:
```python
class BertNERModel(nn.Cell):
"""
This class is responsible for sequence labeling task evaluation, i.e. NER(num_labels=11).
The returned output represents the final logits as the results of log_softmax is propotional to that of softmax.
"""
```
`BertCLS`和`BertNER`在任务模型上接了损失函数,作为`BertFinetuneCell`的输入。
#### 创建训练作业
可以参考[使用常用框架训练模型](https://support.huaweicloud.com/engineers-modelarts/modelarts_23_0238.html)来创建并启动训练作业。
### 代码梳理
创建训练作业的参考配置:
- 算法来源:常用框架->Ascend-Powered-Engine->MindSpore
- 代码目录:选择上述新建的OBS桶中的experiment_4目录
- 启动文件:选择上述新建的OBS桶中的experiment_4目录下的`finetune.py`
- 数据来源:数据存储位置->选择上述新建的OBS桶中的experiment_4文件夹下的tnews目录
- 训练输出位置:选择上述新建的OBS桶中的experiment_4目录并在其中创建finetune_output目录
- 作业日志路径:同训练输出位置
- 规格:Ascend:1*Ascend 910
- 其他均为默认
启动并查看训练过程:
1. 点击提交以开始训练,微调过程约18分钟;
2. 在训练作业列表里可以看到刚创建的训练作业,在训练作业页面可以看到版本管理;
3. 点击运行中的训练作业,在展开的窗口中可以查看作业配置信息,以及训练过程中的日志,日志会不断刷新,等训练作业完成后也可以下载日志到本地进行查看;
4. 在训练日志中可以看到`epoch: 3, step: 10005, outputs are (1.4425085, False)`等字段,即微调过程的输出;
## 验证BERT
在TNEWS验证集上对微调后的BERT模型做验证(evaluation)。
### 代码梳理
验证BERT依赖如下几个模块:
- `evaluation.py`:包含Accuracy(分类任务)、F1值(NER任务)的计算,数据处理等。
- `evaluation_config.py`:模型和训练配置;
- `cluener_evaluation.py`:中文任务基准测评(Chinese Language Understanding Evaluation Benchmark)方法,未使用;
- `tokenization.py`:基于vocab.txt,将单词token化,未使用;
- `sample_process.py`:基于`tokenization.py`进行文本数据处理,未使用;
- `utils.py`:依赖微调时用的模型。
脚本传参、数据拷贝等代码参考预训练BERT中的解释。
#### 创建训练作业
可以参考[使用常用框架训练模型](https://support.huaweicloud.com/engineers-modelarts/modelarts_23_0238.html)来创建并启动训练作业。
创建训练作业的参考配置:
- 算法来源:常用框架->Ascend-Powered-Engine->MindSpore
- 代码目录:选择上述新建的OBS桶中的experiment_4目录
- 启动文件:选择上述新建的OBS桶中的experiment_4目录下的`evaluation.py`
- 数据来源:数据存储位置->选择上述新建的OBS桶中的experiment_4文件夹下的tnews目录
- 训练输出位置:选择上述新建的OBS桶中的experiment_4目录并在其中创建eval_output目录
- 作业日志路径:同训练输出位置
- 规格:Ascend:1*Ascend 910
- 其他均为默认
启动并查看训练过程:
1. 点击提交以开始训练;
2. 在训练作业列表里可以看到刚创建的训练作业,在训练作业页面可以看到版本管理;
3. 点击运行中的训练作业,在展开的窗口中可以查看作业配置信息,以及训练过程中的日志,日志会不断刷新,等训练作业完成后也可以下载日志到本地进行查看;
4. 在训练日志中可以看到`acc_num 5437 , total_num 10000, accuracy 0.543700`字段,即微调完成后的验证精度。
## 实验结论
本实验主要介绍使用MindSpore在zhwiki数据集上预训练BERT,在TNEWS短文本分类数据集上进行微调,包括以下特性:
- model_zoo:BERT
- BERT预训练
- BERT微调
- 不同的优化器
- 文本数据集处理
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
'''
CRF script.
'''
import numpy as np
import mindspore.nn as nn
from mindspore.ops import operations as P
from mindspore.common.tensor import Tensor
from mindspore.common.parameter import Parameter
import mindspore.common.dtype as mstype
class CRF(nn.Cell):
    '''
    Conditional Random Field layer built from MindSpore primitives.

    In training mode `construct` returns the mean of (all-paths score - gold-path
    score); in evaluation mode it returns the Viterbi back-pointers together with
    the best final tag, which `postprocess` turns into tag paths.

    Args:
        tag_to_index: The dict for tag to index mapping with extra "<START>" and "<STOP>" sign.
        batch_size: Batch size, i.e., the length of the first dimension.
        seq_length: Sequence length, i.e., the length of the second dimension.
        is_training: Specifies whether to use training mode.
    Returns:
        Training mode: Tensor, total loss.
        Evaluation mode: Tuple, the index for each step with the highest score; Tuple, the index for the last
        step with the highest score.
    '''
    def __init__(self, tag_to_index, batch_size=1, seq_length=128, is_training=True):

        super(CRF, self).__init__()
        self.target_size = len(tag_to_index)
        self.is_training = is_training
        self.tag_to_index = tag_to_index
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.START_TAG = "<START>"
        self.STOP_TAG = "<STOP>"
        # NOTE(review): these hard-code "<START>"/"<STOP>" as the last two tag
        # indices, whereas the lines below look them up in tag_to_index; the two
        # only agree when the mapping places those tags last.
        self.START_VALUE = Tensor(self.target_size-2, dtype=mstype.int32)
        self.STOP_VALUE = Tensor(self.target_size-1, dtype=mstype.int32)
        # Randomly initialised transition scores; the START row and the STOP
        # column are pinned to a large negative value.
        # Presumably the layout is transitions[to_tag, from_tag], so this forbids
        # entering START and leaving STOP — TODO confirm.
        transitions = np.random.normal(size=(self.target_size, self.target_size)).astype(np.float32)
        transitions[tag_to_index[self.START_TAG], :] = -10000
        transitions[:, tag_to_index[self.STOP_TAG]] = -10000
        self.transitions = Parameter(Tensor(transitions), name="transition_matrix")
        # Primitive operators used by the scoring / decoding methods.
        self.cat = P.Concat(axis=-1)
        self.argmax = P.ArgMaxWithValue(axis=-1)
        self.log = P.Log()
        self.exp = P.Exp()
        self.sum = P.ReduceSum()
        self.tile = P.Tile()
        self.reduce_sum = P.ReduceSum(keep_dims=True)
        self.reshape = P.Reshape()
        self.expand = P.ExpandDims()
        self.mean = P.ReduceMean()
        # Initial forward scores: 0 for START, -10000 for every other tag.
        init_alphas = np.ones(shape=(self.batch_size, self.target_size)) * -10000.0
        init_alphas[:, self.tag_to_index[self.START_TAG]] = 0.
        self.init_alphas = Tensor(init_alphas, dtype=mstype.float32)
        self.cast = P.Cast()
        self.reduce_max = P.ReduceMax(keep_dims=True)
        self.on_value = Tensor(1.0, dtype=mstype.float32)
        self.off_value = Tensor(0.0, dtype=mstype.float32)
        self.onehot = P.OneHot()

    def log_sum_exp(self, logits):
        '''
        Compute log(sum(exp(logits))) over the last axis, keeping that axis.

        The maximum is subtracted before exponentiation and added back afterwards.
        '''
        max_score = self.reduce_max(logits, -1)  # original note: "16 5 5" (presumably an example shape)
        score = self.log(self.reduce_sum(self.exp(logits - max_score), -1))
        score = max_score + score
        return score

    def _realpath_score(self, features, label):
        '''
        Compute the emission and transition score for the real (gold) path.

        Assumes features is (batch_size, seq_length, target_size) and label is
        (batch_size, seq_length) — TODO confirm against the caller.
        Returns a tensor reshaped to (batch_size, -1).
        '''
        # NOTE(review): multiplying by 1 has no numeric effect; presumably a
        # graph-mode workaround — confirm before removing.
        label = label * 1
        # Prepend the START index to every label sequence.
        concat_A = self.tile(self.reshape(self.START_VALUE, (1,)), (self.batch_size,))
        concat_A = self.reshape(concat_A, (self.batch_size, 1))
        labels = self.cat((concat_A, label))
        # Emission part: keep only the feature entries selected by the gold tags.
        onehot_label = self.onehot(label, self.target_size, self.on_value, self.off_value)
        emits = features * onehot_label
        # Transition part: outer product of one-hot "current" (label1) and
        # one-hot "previous" (label2) tags selects one transitions entry per step.
        labels = self.onehot(labels, self.target_size, self.on_value, self.off_value)
        label1 = labels[:, 1:, :]
        label2 = labels[:, :self.seq_length, :]
        label1 = self.expand(label1, 3)
        label2 = self.expand(label2, 2)
        label_trans = label1 * label2
        transitions = self.expand(self.expand(self.transitions, 0), 0)
        trans = transitions * label_trans
        score = self.sum(emits, (1, 2)) + self.sum(trans, (1, 2, 3))
        # Add the score of the last transitions row (index target_size-1).
        # NOTE(review): `labels` has one extra leading START entry here, so this
        # slice picks position seq_length-1 of the START-prefixed sequence rather
        # than its final entry — confirm this is intended.
        stop_value_index = labels[:, (self.seq_length-1):self.seq_length, :]
        stop_value = self.transitions[(self.target_size-1):self.target_size, :]
        stop_score = stop_value * self.reshape(stop_value_index, (self.batch_size, self.target_size))
        score = score + self.sum(stop_score, 1)
        score = self.reshape(score, (self.batch_size, -1))
        return score

    def _normalization_factor(self, features):
        '''
        Compute the total score for all the paths (forward algorithm).

        Returns a tensor reshaped to (batch_size, -1).
        '''
        forward_var = self.init_alphas
        forward_var = self.expand(forward_var, 1)
        for idx in range(self.seq_length):
            feat = features[:, idx:(idx+1), :]
            # Emission as a column so it broadcasts against the transition matrix.
            emit_score = self.reshape(feat, (self.batch_size, self.target_size, 1))
            next_tag_var = emit_score + self.transitions + forward_var
            forward_var = self.log_sum_exp(next_tag_var)
            forward_var = self.reshape(forward_var, (self.batch_size, 1, self.target_size))
        # Close every path with the last transitions row (index target_size-1).
        terminal_var = forward_var + self.reshape(self.transitions[(self.target_size-1):self.target_size, :], (1, -1))
        alpha = self.log_sum_exp(terminal_var)
        alpha = self.reshape(alpha, (self.batch_size, -1))
        return alpha

    def _decoder(self, features):
        '''
        Viterbi decode for evaluation.

        Returns a tuple with one 1-tuple of back-pointers per time step, plus the
        arg-max tag index after the final step.
        '''
        backpointers = ()
        forward_var = self.init_alphas
        for idx in range(self.seq_length):
            feat = features[:, idx:(idx+1), :]
            feat = self.reshape(feat, (self.batch_size, self.target_size))
            bptrs_t = ()

            # For every tag keep the best-scoring predecessor and its score.
            next_tag_var = self.expand(forward_var, 1) + self.transitions
            best_tag_id, best_tag_value = self.argmax(next_tag_var)
            bptrs_t += (best_tag_id,)
            forward_var = best_tag_value + feat

            backpointers += (bptrs_t,)
        terminal_var = forward_var + self.reshape(self.transitions[(self.target_size-1):self.target_size, :], (1, -1))
        best_tag_id, _ = self.argmax(terminal_var)
        return backpointers, best_tag_id

    def construct(self, features, label):
        '''
        Return the training loss when is_training is set, otherwise the
        (back-pointers, best final tag) pair produced by `_decoder`.
        '''
        if self.is_training:
            forward_score = self._normalization_factor(features)
            gold_score = self._realpath_score(features, label)
            return_value = self.mean(forward_score - gold_score)
        else:
            path_list, tag = self._decoder(features)
            return_value = path_list, tag
        return return_value
def postprocess(backpointers, best_tag_id):
    '''
    Follow Viterbi back-pointers to recover the best tag path of every sample.

    Args:
        backpointers: Sequence with one entry per time step; element 0 of each
            entry exposes `asnumpy()` and is indexed as [sample][tag] to give the
            best previous tag.
        best_tag_id: Object exposing `asnumpy()`; holds the best final tag of
            every sample.
    Returns:
        List with one list per sample, holding the tag ids in forward order.
    '''
    best_tag_id = best_tag_id.asnumpy()
    # Convert each step's back-pointer tensor once, in reverse time order,
    # instead of re-converting it for every sample in the batch.
    reversed_steps = [bptrs_t[0].asnumpy() for bptrs_t in reversed(backpointers)]

    best_path = []
    for i in range(len(best_tag_id)):
        best_local_id = best_tag_id[i]
        path = [best_local_id]
        for step in reversed_steps:
            best_local_id = step[i][best_local_id]
            path.append(best_local_id)
        # Pop off the start tag (we dont want to return that to the caller)
        path.pop()
        path.reverse()
        best_path.append(path)
    return best_path
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
'''bert clue evaluation'''
import json
import numpy as np
from evaluation_config import cfg
import mindspore.common.dtype as mstype
from mindspore.common.tensor import Tensor
from CRF import postprocess
import tokenization
from sample_process import label_generation, process_one_example_p
# Vocabulary file handed to tokenization.FullTokenizer in process().
vocab_file = "tnews/vocab.txt"
def process(model, text, sequence_length):
    """
    Run the model on one piece of text and build labels from its predictions.

    The text is tokenized, converted to int32 tensors and passed to
    `model.predict`; the predicted tag ids (decoded through `postprocess` when
    cfg.use_crf is set, arg-max of the logits otherwise) are handed to
    `label_generation` together with the text.
    """
    tokenizer_ = tokenization.FullTokenizer(vocab_file=vocab_file)
    tag_ids = []
    for sample in [text]:
        feature = process_one_example_p(tokenizer_, sample, max_seq_len=sequence_length)
        raw_ids, raw_mask, raw_type_id = feature
        input_ids = Tensor(np.array(raw_ids), mstype.int32)
        input_mask = Tensor(np.array(raw_mask), mstype.int32)
        token_type_id = Tensor(np.array(raw_type_id), mstype.int32)
        # Tensor(1) fills the label slot of the network's inputs.
        prediction = model.predict(input_ids, input_mask, token_type_id, Tensor(1))
        if cfg.use_crf:
            backpointers, best_tag_id = prediction
            tag_ids = []
            for path in postprocess(backpointers, best_tag_id):
                tag_ids.extend(path)
        else:
            tag_ids = list(np.argmax(prediction.asnumpy(), axis=-1))
    return label_generation(text, tag_ids)
def submit(model, path, sequence_length):
    """
    Predict labels for every JSON line in `path` and write them to ner_predict.json.

    Args:
        model: Model passed through to `process`.
        path: File with one JSON object per line, each carrying a "text" field;
            blank lines are skipped.
        sequence_length: Maximum sequence length passed through to `process`.
    """
    data = []
    # Context managers close both files deterministically; the explicit UTF-8
    # encoding keeps non-ASCII text independent of the platform default.
    with open(path, encoding="utf-8") as source:
        for line in source:
            stripped = line.strip()
            if not stripped:
                continue
            record = json.loads(stripped)
            res = process(model, record["text"], sequence_length)
            print("_text", record["text"])
            print("res:", res)
            data.append(json.dumps({"label": res}, ensure_ascii=False))
    with open("ner_predict.json", "w", encoding="utf-8") as target:
        target.write("\n".join(data))
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
Bert evaluation script.
"""
import os
os.environ['P_NUM'] = '16'
import numpy as np
from evaluation_config import cfg, bert_net_cfg
from utils import BertNER, BertCLS
import mindspore.common.dtype as mstype
from mindspore import context
from mindspore.common.tensor import Tensor
import mindspore.dataset as de
import mindspore.dataset.transforms.c_transforms as C
from mindspore.train.model import Model
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from CRF import postprocess
from cluener_evaluation import submit
from finetune_config import tag_to_index
class Accuracy():
    '''
    Running classification accuracy accumulated across batches.
    '''
    def __init__(self):
        self.acc_num = 0
        self.total_num = 0

    def update(self, logits, labels):
        '''
        Add one batch to the counters and print the running accuracy.

        Both arguments must expose `asnumpy()`; predictions are the arg-max of
        `logits` over the last axis and labels are flattened before comparing.
        '''
        flat_labels = np.reshape(labels.asnumpy(), -1)
        predicted = np.argmax(logits.asnumpy(), axis=-1)
        self.acc_num += np.sum(flat_labels == predicted)
        self.total_num += len(flat_labels)
        print("=========================accuracy is ", self.acc_num / self.total_num)
class F1():
    '''
    Accumulates TP / FP / FN counts for the sequence-labelling task.

    A position counts as positive when its id lies in 1 .. cfg.num_labels - 1.
    '''
    def __init__(self):
        self.TP = 0
        self.FP = 0
        self.FN = 0

    def update(self, logits, labels):
        '''
        Add one batch to the TP / FP / FN counters.

        With cfg.use_crf, `logits` is the (backpointers, best_tag_id) pair and is
        decoded through `postprocess`; otherwise predictions are the flattened
        arg-max of `logits` over the last axis.
        '''
        gold = np.reshape(labels.asnumpy(), -1)
        if cfg.use_crf:
            backpointers, best_tag_id = logits
            predicted = []
            for path in postprocess(backpointers, best_tag_id):
                predicted.extend(path)
        else:
            predicted = np.reshape(np.argmax(logits.asnumpy(), axis=-1), -1)

        positive_ids = [i for i in range(1, cfg.num_labels)]
        predicted_positive = np.isin(predicted, positive_ids)
        gold_positive = np.isin(gold, positive_ids)
        self.TP += np.sum(predicted_positive & gold_positive)
        self.FP += np.sum(predicted_positive & (~gold_positive))
        self.FN += np.sum((~predicted_positive) & gold_positive)
def get_dataset(batch_size=1, repeat_count=1, distribute_file=''):
    '''
    Build the TFRecord dataset described by cfg.data_file / cfg.schema_file.

    All four columns are cast to int32; the dataset is then repeated, shuffled
    with a 960-element buffer and batched with incomplete batches dropped.
    `distribute_file` is accepted but not used.
    '''
    columns = ["input_ids", "input_mask", "segment_ids", "label_ids"]
    dataset = de.TFRecordDataset([cfg.data_file], cfg.schema_file, columns_list=columns)

    to_int32 = C.TypeCast(mstype.int32)
    for column in ("segment_ids", "input_mask", "input_ids", "label_ids"):
        dataset = dataset.map(input_columns=column, operations=to_int32)

    dataset = dataset.repeat(repeat_count)
    dataset = dataset.shuffle(buffer_size=960)
    return dataset.batch(batch_size, drop_remainder=True)
def bert_predict(Evaluation):
    '''
    Build the evaluation network, load the fine-tuned checkpoint and wrap it in a Model.

    Args:
        Evaluation: Network class to instantiate (test_eval passes BertNER or BertCLS).
    Returns:
        Tuple (model, dataset): the Model wrapping the network, and the evaluation dataset.
    '''
    # Fall back to device 0 when DEVICE_ID is unset instead of failing on int(None).
    devid = int(os.getenv('DEVICE_ID', '0'))
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid)
    dataset = get_dataset(bert_net_cfg.batch_size, 1)
    if cfg.use_crf:
        net_for_pretraining = Evaluation(bert_net_cfg, False, num_labels=len(tag_to_index), use_crf=True,
                                         tag_to_index=tag_to_index, dropout_prob=0.0)
    else:
        # Read the label count from cfg directly: the module-level `num_labels`
        # name only exists when this file is executed as a script.
        net_for_pretraining = Evaluation(bert_net_cfg, False, cfg.num_labels)
    net_for_pretraining.set_train(False)
    param_dict = load_checkpoint(cfg.finetune_ckpt)
    load_param_into_net(net_for_pretraining, param_dict)
    model = Model(net_for_pretraining)
    return model, dataset
def test_eval():
    '''
    Evaluate the fine-tuned model and print its metrics.

    With cfg.clue_benchmark set, predictions are written through `submit`.
    Otherwise the dataset is iterated once and either Precision / Recall / F1
    (cfg.task == "NER") or accuracy (any other task) is printed.
    '''
    task_type = BertNER if cfg.task == "NER" else BertCLS
    model, dataset = bert_predict(task_type)
    if cfg.clue_benchmark:
        submit(model, cfg.data_file, bert_net_cfg.seq_length)
        return

    callback = F1() if cfg.task == "NER" else Accuracy()
    columns_list = ["input_ids", "input_mask", "segment_ids", "label_ids"]
    for data in dataset.create_dict_iterator():
        input_ids, input_mask, token_type_id, label_ids = [Tensor(data[name]) for name in columns_list]
        logits = model.predict(input_ids, input_mask, token_type_id, label_ids)
        callback.update(logits, label_ids)
    print("==============================================================")
    if cfg.task == "NER":
        print("Precision {:.6f} ".format(callback.TP / (callback.TP + callback.FP)))
        print("Recall {:.6f} ".format(callback.TP / (callback.TP + callback.FN)))
        # F1 = 2TP / (2TP + FP + FN); the denominator previously counted FP twice
        # and ignored FN.
        print("F1 {:.6f} ".format(2*callback.TP / (2*callback.TP + callback.FP + callback.FN)))
    else:
        print("acc_num {} , total_num {}, accuracy {:.6f}".format(callback.acc_num, callback.total_num,
                                                                  callback.acc_num / callback.total_num))
    print("==============================================================")
if __name__ == "__main__":
    # Parse the job arguments; unrecognised arguments are tolerated
    # (parse_known_args) and --num_epochs is parsed but not read below.
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_url', required=True, default=None, help='Location of data.')
    parser.add_argument('--train_url', required=True, default=None, help='Location of training outputs.')
    parser.add_argument('--num_epochs', type=int, default=1, help='Number of training epochs.')
    args, unknown = parser.parse_known_args()

    # Copy the data directory into the local 'tnews/' folder that cfg's paths point to.
    import moxing as mox
    mox.file.copy_parallel(src_url=args.data_url, dst_url='tnews/')

    # Module-level alias of cfg.num_labels, kept for code that reads the global name.
    num_labels = cfg.num_labels
    test_eval()
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
config settings, will be used in evaluation.py
"""
from easydict import EasyDict as edict
import mindspore.common.dtype as mstype
from mindspore.model_zoo.Bert_NEZHA import BertConfig
# Evaluation settings.
cfg = edict({
    # Task selector; the evaluation script only tests it against "NER".
    'task': 'classfication',
    'num_labels': 15,
    # Data and schema files of the evaluation split.
    'data_file': 'tnews/dev.tf_record',
    'schema_file': 'tnews/dev_schema.json',
    # Checkpoint loaded into the evaluation network.
    'finetune_ckpt': 'tnews/bert_classfication-3_3335.ckpt',
    'use_crf': False,
    # Also switches the batch size of bert_net_cfg below to 1 when True.
    'clue_benchmark': False,
})
# Network configuration for evaluation; both dropout probabilities are 0.0.
bert_net_cfg = BertConfig(
    # Batch size is 1 in CLUE benchmark mode, 16 otherwise.
    batch_size=16 if not cfg.clue_benchmark else 1,
    seq_length=128,
    vocab_size=21128,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    hidden_dropout_prob=0.0,
    attention_probs_dropout_prob=0.0,
    max_position_embeddings=512,
    type_vocab_size=2,
    initializer_range=0.02,
    use_relative_positions=False,
    input_mask_from_dataset=True,
    token_type_ids_from_dataset=True,
    dtype=mstype.float32,
    compute_type=mstype.float16,
)
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
'''
Bert finetune script.
'''
import os
os.environ['P_NUM'] = '16'
from utils import BertFinetuneCell, BertCLS, BertNER
from finetune_config import cfg, bert_net_cfg, tag_to_index
import mindspore.common.dtype as mstype
import mindspore.communication.management as D
from mindspore import context
import mindspore.dataset as de
import mindspore.dataset.transforms.c_transforms as C
from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell
from mindspore.nn.optim import AdamWeightDecay, AdamWeightDecayDynamicLR, Lamb, Momentum
from mindspore.train.model import Model
from mindspore.train.callback import Callback
from mindspore.train.callback import CheckpointConfig, ModelCheckpoint
from mindspore.train.serialization import load_checkpoint, load_param_into_net
class LossCallBack(Callback):
    '''
    Print the network outputs (loss) during training.

    Note:
        If per_print_times is 0, do not print loss.
    Args:
        per_print_times (int): Print the outputs every `per_print_times` steps. Default: 1.
    Raises:
        ValueError: If per_print_times is not an int or is negative.
    '''
    def __init__(self, per_print_times=1):
        super(LossCallBack, self).__init__()
        if not isinstance(per_print_times, int) or per_print_times < 0:
            raise ValueError("print_step must be int and >= 0.")
        self._per_print_times = per_print_times

    def step_end(self, run_context):
        '''
        Print epoch, step and network outputs at the configured interval.
        '''
        cb_params = run_context.original_args()
        # Honour the interval: 0 disables printing, n > 0 prints on every n-th step.
        if self._per_print_times == 0 or cb_params.cur_step_num % self._per_print_times != 0:
            return
        print("epoch: {}, step: {}, outputs are {}".format(cb_params.cur_epoch_num, cb_params.cur_step_num,
                                                           str(cb_params.net_outputs)))
def get_dataset(batch_size=1, repeat_count=1, distribute_file=''):
    '''
    Build the fine-tuning dataset from cfg.data_file / cfg.schema_file.

    Every column is cast to int32, then the dataset is repeated `repeat_count`
    times, shuffled with a 960-element buffer and batched, dropping the last
    incomplete batch. `distribute_file` is accepted but not used.
    '''
    column_names = ["input_ids", "input_mask", "segment_ids", "label_ids"]
    records = de.TFRecordDataset([cfg.data_file], cfg.schema_file, columns_list=column_names)

    int32_cast = C.TypeCast(mstype.int32)
    for name in ("segment_ids", "input_mask", "input_ids", "label_ids"):
        records = records.map(input_columns=name, operations=int32_cast)

    records = records.repeat(repeat_count)
    records = records.shuffle(buffer_size=960)
    return records.batch(batch_size, drop_remainder=True)
def test_train():
    '''
    Fine-tune BERT on the task selected by cfg.

    Builds the network with loss (BertNER for cfg.task == 'NER', BertCLS
    otherwise), the dataset and the optimizer named by cfg.optimizer, loads the
    pre-training checkpoint, then trains with dynamic loss scaling while
    printing outputs and saving checkpoints.

    Raises:
        Exception: If cfg.optimizer names an unsupported optimizer.
    '''
    device_id = int(os.getenv('DEVICE_ID'))
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=device_id,
                        enable_mem_reuse=True, enable_task_sink=True)

    # BertCLS handles classification, BertNER handles sequence labeling.
    if cfg.task != 'NER':
        netwithloss = BertCLS(bert_net_cfg, True, num_labels=cfg.num_labels, dropout_prob=0.1)
    elif cfg.use_crf:
        netwithloss = BertNER(bert_net_cfg, True, num_labels=len(tag_to_index), use_crf=True,
                              tag_to_index=tag_to_index, dropout_prob=0.1)
    else:
        netwithloss = BertNER(bert_net_cfg, True, num_labels=cfg.num_labels, dropout_prob=0.1)

    dataset = get_dataset(bert_net_cfg.batch_size, cfg.epoch_num)

    # optimizer
    steps_per_epoch = dataset.get_dataset_size()
    optimizer_name = cfg.optimizer
    if optimizer_name == 'AdamWeightDecayDynamicLR':
        # warmup_steps and decay_filter are not passed to this optimizer.
        settings = cfg.AdamWeightDecayDynamicLR
        optimizer = AdamWeightDecayDynamicLR(netwithloss.trainable_params(),
                                             decay_steps=steps_per_epoch * cfg.epoch_num,
                                             learning_rate=settings.learning_rate,
                                             end_learning_rate=settings.end_learning_rate,
                                             power=settings.power,
                                             weight_decay=settings.weight_decay,
                                             eps=settings.eps)
    elif optimizer_name == 'AdamWeightDecay':
        settings = cfg.AdamWeightDecay

        def _decays(x):
            # Parameters whose name mentions LayerNorm / layernorm / bias are filtered out.
            return 'LayerNorm' not in x.name and 'bias' not in x.name and 'layernorm' not in x.name

        optimizer = AdamWeightDecay(netwithloss.trainable_params(),
                                    learning_rate=settings.learning_rate,
                                    weight_decay=settings.weight_decay,
                                    eps=settings.eps,
                                    decay_filter=_decays)
    elif optimizer_name == 'Lamb':
        settings = cfg.Lamb
        optimizer = Lamb(netwithloss.trainable_params(),
                         decay_steps=steps_per_epoch * cfg.epoch_num,
                         start_learning_rate=settings.start_learning_rate,
                         end_learning_rate=settings.end_learning_rate,
                         power=settings.power,
                         warmup_steps=steps_per_epoch,
                         decay_filter=settings.decay_filter)
    elif optimizer_name == 'Momentum':
        settings = cfg.Momentum
        optimizer = Momentum(netwithloss.trainable_params(),
                             learning_rate=settings.learning_rate,
                             momentum=settings.momentum)
    else:
        raise Exception("Optimizer not supported.")
    print("check steps, steps_per_epoch: ", steps_per_epoch)

    # Save one checkpoint per epoch, keeping only the most recent file.
    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix=cfg.ckpt_prefix, directory=cfg.ckpt_dir, config=ckpt_config)

    # load checkpoint into network
    load_param_into_net(netwithloss, load_checkpoint(cfg.pre_training_ckpt))

    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
    netwithgrads = BertFinetuneCell(netwithloss, optimizer=optimizer, scale_update_cell=update_cell)
    Model(netwithgrads).train(cfg.epoch_num, dataset, callbacks=[LossCallBack(), ckpoint_cb])
if __name__ == "__main__":
    # Parse the job arguments; unrecognised arguments are tolerated
    # (parse_known_args) and --num_epochs is parsed but not read below.
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_url', required=True, default=None, help='Location of data.')
    parser.add_argument('--train_url', required=True, default=None, help='Location of training outputs.')
    parser.add_argument('--num_epochs', type=int, default=1, help='Number of training epochs.')
    args, unknown = parser.parse_known_args()

    # Copy the data directory into the local 'tnews/' folder that cfg's paths point to.
    import moxing as mox
    mox.file.copy_parallel(src_url=args.data_url, dst_url='tnews/')

    test_train()

    # Copy the resulting checkpoint back next to the data (data_url, not train_url).
    # NOTE(review): the file name is hard-coded; it presumably encodes
    # cfg.ckpt_prefix, the epoch count and the steps per epoch — confirm when
    # any of those change.
    mox.file.copy_parallel(src_url='bert_classfication-3_3335.ckpt',
                           dst_url=os.path.join(args.data_url, 'bert_classfication-3_3335.ckpt'))
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
config settings, will be used in finetune.py
"""
from easydict import EasyDict as edict
import mindspore.common.dtype as mstype
from mindspore.model_zoo.Bert_NEZHA import BertConfig
# Fine-tuning settings, exposed as an attribute-style dictionary.
cfg = edict({
    'task': 'nothing',
    # Number of target classes.
    'num_labels':15,
    # Training records and their schema, both under the local 'tnews/' directory.
    'data_file': 'tnews/train.tf_record',
    'schema_file': 'tnews/train_schema.json',
    # Number of training epochs.
    'epoch_num': 3,
    # Prefix and output directory for saved checkpoints.
    'ckpt_prefix': 'bert_classfication',
    'ckpt_dir': None,
    # Pre-trained checkpoint to load before fine-tuning.
    'pre_training_ckpt': 'tnews/bert_base.ckpt',
    'use_crf': False,
    # Name of the optimizer; presumably selects one of the sub-sections
    # below -- confirm against the training script.
    'optimizer': 'AdamWeightDecayDynamicLR',
    'AdamWeightDecay': edict({
        'learning_rate': 2e-5,
        'weight_decay': 1e-5,
        'eps': 1e-6,
    }),
    'AdamWeightDecayDynamicLR': edict({
        'learning_rate': 2e-5,
        'end_learning_rate': 1e-7,
        'power': 1.0,
        'weight_decay': 1e-5,
        'eps': 1e-6,
    }),
    'Lamb': edict({
        'start_learning_rate': 2e-5,
        'end_learning_rate': 1e-7,
        'power': 1.0,
        # Predicate that returns False for every argument.
        'decay_filter': lambda x: False,
    }),
    'Momentum': edict({
        'learning_rate': 2e-5,
        'momentum': 0.9,
    }),
})
# BERT network hyper-parameters used for fine-tuning.
bert_net_cfg = BertConfig(
    batch_size=16,
    seq_length=128,
    # NOTE(review): presumably the size of the Chinese BERT vocabulary -- confirm
    # it matches the vocab file used by the tokenizer.
    vocab_size=21128,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=2,
    initializer_range=0.02,
    use_relative_positions=False,
    input_mask_from_dataset=True,
    token_type_ids_from_dataset=True,
    # dtype is float32 while compute_type is float16 (presumably mixed
    # precision -- confirm against BertConfig).
    dtype=mstype.float32,
    compute_type=mstype.float16,
)
# Mapping from sequence-labelling tag to integer index.
# Index 0 is the outside tag "O". Each entity type then takes four consecutive
# indices for its S_/B_/M_/E_ tags, and the table ends with the "<START>" and
# "<STOP>" markers.
tag_to_index = {"O": 0}
tag_to_index.update(
    ("{}_{}".format(prefix, entity), 1 + 4 * entity_pos + prefix_pos)
    for entity_pos, entity in enumerate((
        "address", "book", "company", "game", "government",
        "movie", "name", "organization", "position", "scene",
    ))
    for prefix_pos, prefix in enumerate("SBME")
)
tag_to_index["<START>"] = 41
tag_to_index["<STOP>"] = 42
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""train bert network without lossscale"""
import os
import numpy as np
from numpy import allclose
import mindspore.common.dtype as mstype
import mindspore.dataset.engine.datasets as de
import mindspore.dataset.transforms.c_transforms as C
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.train.model import Model
from mindspore.train.callback import Callback, LossMonitor
from mindspore.train.loss_scale_manager import DynamicLossScaleManager
from mindspore.model_zoo.Bert_NEZHA import BertConfig, BertNetworkWithLoss, BertTrainOneStepWithLossScaleCell
from mindspore.nn.optim import Momentum
from mindspore import log as logger
# TFRecord file(s) read by create_dataset(); the script entry point copies the
# input data into 'zhwiki_part/' before training starts.
DATA_DIR = ["zhwiki_part/part.tfrecord"]
# Schema file passed to the TFRecord reader together with DATA_DIR.
SCHEMA_DIR = "zhwiki_part/schema.json"
def get_config(version='base', batch_size=1):
    """Build the BertConfig for a named model variant.

    Args:
        version: 'base', 'large' or 'large_mixed'. Any other value yields a
            BertConfig built from ``batch_size`` alone.
        batch_size: Batch size stored in the returned configuration.

    Returns:
        The BertConfig for the requested variant.
    """
    # Settings that differ between the named variants.
    if version == 'base':
        overrides = dict(
            hidden_size=768,
            num_hidden_layers=12,
            num_attention_heads=12,
            intermediate_size=3072,
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            compute_type=mstype.float32,
        )
    elif version == 'large':
        overrides = dict(
            hidden_size=1024,
            num_hidden_layers=12,
            num_attention_heads=16,
            intermediate_size=4096,
            hidden_dropout_prob=0.0,
            attention_probs_dropout_prob=0.0,
            compute_type=mstype.float16,
        )
    elif version == 'large_mixed':
        overrides = dict(
            hidden_size=1024,
            num_hidden_layers=24,
            num_attention_heads=16,
            intermediate_size=4096,
            hidden_dropout_prob=0.0,
            attention_probs_dropout_prob=0.0,
            compute_type=mstype.float32,
        )
    else:
        # Unknown variant: only the batch size is set explicitly.
        return BertConfig(batch_size=batch_size)

    # Settings shared by every named variant.
    return BertConfig(
        batch_size=batch_size,
        seq_length=128,
        vocab_size=21136,
        hidden_act="gelu",
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        use_relative_positions=True,
        input_mask_from_dataset=True,
        token_type_ids_from_dataset=True,
        dtype=mstype.float32,
        **overrides
    )
def create_dataset():
    """Create the batched, repeated pre-training dataset.

    Reads the TFRecord files in DATA_DIR using SCHEMA_DIR, casts six of the
    seven selected columns to int32 ("masked_lm_weights" is not cast), batches
    with the size taken from the BATCH_SIZE environment variable (default 16),
    and repeats the result ``args.num_epochs`` times.

    NOTE: reads the module-level ``args`` that is only defined when this file
    runs as a script.
    """
    repeat_count = args.num_epochs

    columns = ["input_ids", "input_mask", "segment_ids",
               "next_sentence_labels", "masked_lm_positions",
               "masked_lm_ids", "masked_lm_weights"]
    dataset = de.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=columns, shuffle=False)

    # Cast the integer-valued columns to int32, one map operation per column.
    to_int32 = C.TypeCast(mstype.int32)
    for column in ("masked_lm_ids", "masked_lm_positions", "next_sentence_labels",
                   "segment_ids", "input_mask", "input_ids"):
        dataset = dataset.map(input_columns=column, operations=to_int32)

    # Batch first, then repeat.
    batch_size = int(os.getenv('BATCH_SIZE', '16'))
    dataset = dataset.batch(batch_size, drop_remainder=True)
    return dataset.repeat(repeat_count)
class ModelCallback(Callback):
    """Callback that prints the current epoch number and the network outputs after every step."""

    def __init__(self):
        super(ModelCallback, self).__init__()

    def step_end(self, run_context):
        """Print the epoch number and network outputs found in ``run_context``."""
        params = run_context.original_args()
        message = "epoch: {}, outputs are: {}".format(params.cur_epoch_num, str(params.net_outputs))
        print(message)
def test_bert_tdt():
    """Train the BERT pre-training network on Ascend with dynamic loss scaling.

    The model variant and batch size come from the VERSION and BATCH_SIZE
    environment variables (defaults 'base' and 16). The number of epochs
    passed to ``Model.train`` is the dataset's repeat count.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False)
    context.set_context(enable_task_sink=True)
    context.set_context(enable_mem_reuse=True)

    dataset = create_dataset()

    model_version = os.getenv('VERSION', 'base')
    batch_size = int(os.getenv('BATCH_SIZE', '16'))
    bert_config = get_config(version=model_version, batch_size=batch_size)

    # Network with loss, wrapped in a train-one-step cell that applies loss scaling.
    netwithloss = BertNetworkWithLoss(bert_config, True)
    optimizer = Momentum(netwithloss.trainable_params(), learning_rate=2e-5, momentum=0.9)
    loss_scale_manager = DynamicLossScaleManager(2**32, 2, 3)
    netwithgrads = BertTrainOneStepWithLossScaleCell(
        netwithloss,
        optimizer=optimizer,
        scale_update_cell=loss_scale_manager.get_update_cell(),
    )
    netwithgrads.set_train(True)

    model = Model(netwithgrads)
    step_printer = ModelCallback()
    model.train(dataset.get_repeat_count(), dataset, callbacks=step_printer)
if __name__ == '__main__':
    # Script entry point: parse the job arguments, stage the data locally,
    # then run training.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--data_url', required=True, default=None, help='Location of data.')
    parser.add_argument('--train_url', required=True, default=None, help='Location of training outputs.')
    parser.add_argument('--num_epochs', type=int, default=50, help='Number of training epochs.')
    # `args` stays a module-level name because create_dataset() reads it;
    # unrecognised arguments are collected in `unknown` and ignored.
    args, unknown = parser.parse_known_args()

    import moxing as mox

    # Copy the input data into the local directory that DATA_DIR / SCHEMA_DIR point to.
    mox.file.copy_parallel(src_url=args.data_url, dst_url='zhwiki_part/')

    test_bert_tdt()
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""process txt"""
import re
import json
def process_one_example_p(tokenizer, text, max_seq_len=128):
    """Convert one line of text into fixed-length BERT input features.

    Each character of ``text`` is tokenized separately. The token list is
    truncated so that it fits between "[CLS]" and "[SEP]", then all three
    feature lists are zero-padded up to ``max_seq_len``.

    Args:
        tokenizer: Object providing ``tokenize(str)`` and
            ``convert_tokens_to_ids(list)``.
        text: The input string.
        max_seq_len: Length of every returned list.

    Returns:
        Tuple ``(input_ids, input_mask, segment_ids)``.
    """
    pieces = []
    for char in text:
        pieces.extend(tokenizer.tokenize(char))

    # Leave room for the two special tokens.
    if len(pieces) >= max_seq_len - 1:
        pieces = pieces[:max_seq_len - 2]

    wrapped = ["[CLS]"] + list(pieces) + ["[SEP]"]
    segment_ids = [0] * len(wrapped)

    input_ids = tokenizer.convert_tokens_to_ids(wrapped)
    input_mask = [1] * len(input_ids)

    # Zero-pad every feature by the number of missing ids (no-op when already full).
    missing = max_seq_len - len(input_ids)
    input_ids.extend([0] * missing)
    input_mask.extend([0] * missing)
    segment_ids.extend([0] * missing)

    assert len(input_ids) == max_seq_len
    assert len(input_mask) == max_seq_len
    assert len(segment_ids) == max_seq_len
    return (input_ids, input_mask, segment_ids)
def _record_entity(labels, text, start, end, tag):
    """Store the span text[start:end] under the entity type named by ``tag``.

    The entity type is ``tag`` without its two-character prefix (e.g. "B_").
    An existing entry for the same surface string is overwritten.
    """
    labels.setdefault(tag[2:], {})[text[start:end]] = [[start, end - 1]]


def label_generation(text, probs):
    """Decode per-token label ids into entity spans.

    The id-to-tag table is the key order of "tnews/label2id.json". Position 0
    of ``probs`` is skipped and the next ``len(text)`` ids are aligned with
    the characters of ``text``.

    Args:
        text: The original input string.
        probs: Sequence of predicted label ids, one per token.

    Returns:
        Dict mapping entity type -> {entity text: [[start, end]]}, where
        ``end`` is inclusive.
    """
    # Use a context manager so the file handle is always closed.
    with open("tnews/label2id.json", encoding="utf-8") as label_file:
        label2id = json.load(label_file)
    id2label = list(label2id)

    tags = [id2label[int(label_id)] for label_id in probs[1:len(text) + 1]]

    labels = {}
    start = None
    for index, tag in enumerate(tags):
        if tag.startswith(("B", "S")):
            # A new entity begins here; close the one that was open, if any.
            if start is not None:
                _record_entity(labels, text, start, index, tags[index - 1])
            start = index
        if tag.startswith("O"):
            # An outside tag ends the open entity, if any.
            if start is not None:
                _record_entity(labels, text, start, index, tags[index - 1])
            start = None

    # An entity still open at the end is typed by its first tag.
    if start is not None:
        _record_entity(labels, text, start, len(tags), tags[start])
    return labels
"""Tokenization classes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import re
import unicodedata
import six
#import tensorflow as tf
def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
    """Raise ValueError if ``do_lower_case`` contradicts the checkpoint name.

    The casing is not stored with the checkpoint, so it is inferred from the
    name of the directory that holds ``bert_model.ckpt``. Nothing is checked
    when ``init_checkpoint`` is empty, does not match the expected path
    pattern, or names an unknown model.
    """
    if not init_checkpoint:
        return

    match = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
    if match is None:
        return
    model_name = match.group(1)

    lower_models = (
        "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
        "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12",
    )
    cased_models = (
        "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
        "multi_cased_L-12_H-768_A-12",
    )

    if model_name in lower_models and not do_lower_case:
        actual_flag, case_name, opposite_flag = "False", "lowercased", "True"
    elif model_name in cased_models and do_lower_case:
        actual_flag, case_name, opposite_flag = "True", "cased", "False"
    else:
        # Either the model is unknown or the flag agrees with it.
        return

    raise ValueError(
        "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
        "However, `%s` seems to be a %s model, so you "
        "should pass in `--do_lower_case=%s` so that the fine-tuning matches "
        "how the model was pre-training. If this error is wrong, please "
        "just comment out this check." % (actual_flag, init_checkpoint,
                                          model_name, case_name, opposite_flag))
def convert_to_unicode(text):
    """Return ``text`` as a Unicode string, decoding byte strings as UTF-8.

    Undecodable bytes are ignored. Raises ValueError for any other type.
    """
    if six.PY3:
        if isinstance(text, str):
            return text
        if isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        raise ValueError("Unsupported string type: %s" % (type(text)))

    if six.PY2:
        # On Python 2 `str` is the byte-string type.
        if isinstance(text, str):
            return text.decode("utf-8", "ignore")
        if isinstance(text, unicode):  # noqa: F821 - only defined on Python 2
            return text
        raise ValueError("Unsupported string type: %s" % (type(text)))

    raise ValueError("Not running on Python2 or Python 3?")
def printable_text(text):
    """Return ``text`` as the native ``str`` type of the running interpreter.

    ``print`` and loggers want ``str`` on both Python 2 and Python 3, which is
    a byte string on the former and a Unicode string on the latter. Raises
    ValueError for unsupported types.
    """
    if six.PY3:
        if isinstance(text, str):
            return text
        if isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        raise ValueError("Unsupported string type: %s" % (type(text)))

    if six.PY2:
        if isinstance(text, str):
            return text
        if isinstance(text, unicode):  # noqa: F821 - only defined on Python 2
            return text.encode("utf-8")
        raise ValueError("Unsupported string type: %s" % (type(text)))

    raise ValueError("Not running on Python2 or Python 3?")
def load_vocab(vocab_file):
    """Load a vocabulary file into an ordered token -> index dictionary.

    The file holds one token per line; a token's index is its zero-based line
    number. Surrounding whitespace is stripped from every token.

    Args:
        vocab_file: Path of the UTF-8 encoded vocabulary file.

    Returns:
        ``collections.OrderedDict`` mapping token to index.
    """
    # io.open accepts `encoding` on both Python 2 and Python 3. Decoding
    # explicitly as UTF-8 avoids depending on the platform's locale default,
    # which breaks non-ASCII vocabularies on non-UTF-8 systems.
    import io

    vocab = collections.OrderedDict()
    with io.open(vocab_file, "r", encoding="utf-8") as reader:
        for index, line in enumerate(reader):
            vocab[convert_to_unicode(line).strip()] = index
    return vocab
def convert_by_vocab(vocab, items):
    """Map every element of ``items`` through ``vocab``.

    Elements missing from ``vocab`` are replaced by the entry stored under
    the '[UNK]' key.
    """
    return [vocab[item] if item in vocab else vocab['[UNK]'] for item in items]
def convert_tokens_to_ids(vocab, tokens):
    """Look up ``tokens`` in ``vocab`` and return the resulting list."""
    ids = convert_by_vocab(vocab, tokens)
    return ids
def convert_ids_to_tokens(inv_vocab, ids):
    """Look up ``ids`` in ``inv_vocab`` and return the resulting list."""
    tokens = convert_by_vocab(inv_vocab, ids)
    return tokens
def whitespace_tokenize(text):
    """Strip ``text`` and split it on runs of whitespace; blank input gives []."""
    stripped = text.strip()
    return stripped.split() if stripped else []
class FullTokenizer(object):
    """Runs end-to-end tokenization: basic tokenization followed by WordPiece."""

    def __init__(self, vocab_file, do_lower_case=True):
        """Load the vocabulary from ``vocab_file`` and build both sub-tokenizers."""
        self.vocab = load_vocab(vocab_file)
        # Reverse mapping: index -> token.
        self.inv_vocab = {index: token for token, index in self.vocab.items()}
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

    def tokenize(self, text):
        """Split ``text`` into basic tokens, then each basic token into word pieces."""
        return [
            piece
            for word in self.basic_tokenizer.tokenize(text)
            for piece in self.wordpiece_tokenizer.tokenize(word)
        ]

    def convert_tokens_to_ids(self, tokens):
        """Map tokens to vocabulary indices."""
        return convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        """Map vocabulary indices back to tokens."""
        return convert_by_vocab(self.inv_vocab, ids)
class BasicTokenizer(object):
    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

    def __init__(self, do_lower_case=True):
        """Construct a BasicTokenizer.

        Args:
            do_lower_case: Whether to lower-case the input (accents are
                stripped as well when set).
        """
        self.do_lower_case = do_lower_case

    def tokenize(self, text):
        """Tokenize a piece of text into a list of basic tokens."""
        cleaned = self._clean_text(convert_to_unicode(text))
        # CJK characters are surrounded with spaces so that each becomes its
        # own token. This is applied to every model, not only the Chinese and
        # multilingual ones.
        spaced = self._tokenize_chinese_chars(cleaned)

        pieces = []
        for word in whitespace_tokenize(spaced):
            if self.do_lower_case:
                word = self._run_strip_accents(word.lower())
            pieces.extend(self._run_split_on_punc(word))

        return whitespace_tokenize(" ".join(pieces))

    def _run_strip_accents(self, text):
        """Strip accents: decompose to NFD and drop the combining marks (category Mn)."""
        decomposed = unicodedata.normalize("NFD", text)
        return "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")

    def _run_split_on_punc(self, text):
        """Split ``text`` so that every punctuation character is its own piece."""
        groups = []
        begin_new_word = True
        for ch in text:
            if _is_punctuation(ch):
                groups.append([ch])
                begin_new_word = True
                continue
            if begin_new_word:
                groups.append([])
                begin_new_word = False
            groups[-1].append(ch)
        return ["".join(group) for group in groups]

    def _tokenize_chinese_chars(self, text):
        """Add a space on both sides of every CJK character."""
        return "".join(
            " " + ch + " " if self._is_chinese_char(ord(ch)) else ch
            for ch in text
        )

    def _is_chinese_char(self, cp):
        """Check whether ``cp`` is the code point of a CJK character.

        "CJK character" here means the code point lies in one of the ranges
        listed below. Hangul, Hiragana and Katakana live in other blocks and
        are written with spaces between words, so they are handled like any
        other language.
        """
        cjk_ranges = (
            (0x4E00, 0x9FFF),
            (0x3400, 0x4DBF),
            (0x20000, 0x2A6DF),
            (0x2A700, 0x2B73F),
            (0x2B740, 0x2B81F),
            (0x2B820, 0x2CEAF),
            (0xF900, 0xFAFF),
            (0x2F800, 0x2FA1F),
        )
        return any(low <= cp <= high for low, high in cjk_ranges)

    def _clean_text(self, text):
        """Drop NUL, U+FFFD and control characters; turn whitespace into plain spaces."""
        kept = []
        for ch in text:
            if ord(ch) in (0, 0xfffd) or _is_control(ch):
                continue
            kept.append(" " if _is_whitespace(ch) else ch)
        return "".join(kept)
class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """Tokenize a piece of text into its word pieces.

        Uses a greedy longest-match-first algorithm over the given vocabulary.

        For example:
            input = "unaffable"
            output = ["un", "##aff", "##able"]

        Args:
            text: A single token or whitespace separated tokens. This should
                already have been passed through `BasicTokenizer`.

        Returns:
            A list of wordpiece tokens. A word longer than
            ``max_input_chars_per_word``, or one that cannot be fully
            segmented, is replaced by ``unk_token``.
        """
        output_tokens = []
        for word in whitespace_tokenize(convert_to_unicode(text)):
            chars = list(word)
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            pieces = []
            start = 0
            fully_matched = True
            while start < len(chars):
                # Try the longest candidate first and shrink from the right.
                for end in range(len(chars), start, -1):
                    candidate = "".join(chars[start:end])
                    if start > 0:
                        # Continuation pieces carry the "##" prefix.
                        candidate = "##" + candidate
                    if candidate in self.vocab:
                        break
                else:
                    # No prefix of the remainder is in the vocabulary.
                    fully_matched = False
                    break
                pieces.append(candidate)
                start = end

            if fully_matched:
                output_tokens.extend(pieces)
            else:
                output_tokens.append(self.unk_token)
        return output_tokens
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically contorl characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
'''
Functional Cells used in Bert finetune and evaluation.
'''
import mindspore.nn as nn
from mindspore.common.initializer import TruncatedNormal
from mindspore.ops import operations as P
from mindspore.ops import functional as F
from mindspore.ops import composite as C
from mindspore.common.tensor import Tensor
from mindspore.common.parameter import Parameter, ParameterTuple
from mindspore.common import dtype as mstype
from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
from mindspore.train.parallel_utils import ParallelMode
from mindspore.communication.management import get_group_size
from mindspore import context
from mindspore.model_zoo.Bert_NEZHA.bert_model import BertModel
from mindspore.model_zoo.Bert_NEZHA.bert_for_pre_training import ClipGradients
from CRF import CRF
# Clip mode and threshold handed to ClipGradients in BertFinetuneCell.construct.
# NOTE(review): the meaning of clip type 1 is defined by ClipGradients -- confirm there.
GRADIENT_CLIP_TYPE = 1
GRADIENT_CLIP_VALUE = 1.0
# Multi-type function graph used to divide gradients by the loss scale;
# its (Tensor, Tensor) overload is registered just below.
grad_scale = C.MultitypeFuncGraph("grad_scale")
# Reciprocal operator shared by the registered overload.
reciprocal = P.Reciprocal()
@grad_scale.register("Tensor", "Tensor")
def tensor_grad_scale(scale, grad):
    """Return ``grad`` multiplied by the reciprocal of ``scale``."""
    inverse_scale = reciprocal(scale)
    return grad * inverse_scale
class BertFinetuneCell(nn.Cell):
    """
    Training cell for fine-tuning, where the wrapped network takes only four input tensors.

    One call computes the loss, takes scaled gradients w.r.t. the trainable
    weights, un-scales and clips them, checks the float status for overflow
    and applies the optimizer only when no overflow was detected.

    Args:
        network: Cell that returns the loss for (input_ids, input_mask, token_type_id, label_ids).
        optimizer: Optimizer cell applied to the gradients.
        scale_update_cell: Optional cell that updates the loss scale; when given,
            its initial loss scale is stored in the `loss_scale` parameter.
    """
    def __init__(self, network, optimizer, scale_update_cell=None):
        super(BertFinetuneCell, self).__init__(auto_prefix=False)
        self.network = network
        self.weights = ParameterTuple(network.trainable_params())
        self.optimizer = optimizer
        # Gradient w.r.t. the weight list, with the sensitivity passed as an extra input.
        self.grad = C.GradOperation('grad',
                                    get_by_list=True,
                                    sens_param=True)
        self.reducer_flag = False
        self.allreduce = P.AllReduce()
        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
        # Gradients are reduced across devices only in data-parallel / hybrid-parallel mode.
        if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
            self.reducer_flag = True
        self.grad_reducer = None
        if self.reducer_flag:
            mean = context.get_auto_parallel_context("mirror_mean")
            degree = get_group_size()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
        self.clip_gradients = ClipGradients()
        self.cast = P.Cast()
        # Operators that allocate, read and clear the float status used for overflow detection.
        self.alloc_status = P.NPUAllocFloatStatus()
        self.get_status = P.NPUGetFloatStatus()
        self.clear_before_grad = P.NPUClearFloatStatus()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.depend_parameter_use = P.ControlDepend(depend_mode=1)
        # Constant 1.0; overflow is flagged when it is <= the summed status.
        self.base = Tensor(1, mstype.float32)
        self.less_equal = P.LessEqual()
        self.hyper_map = C.HyperMap()
        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                        name="loss_scale")
    def construct(self,
                  input_ids,
                  input_mask,
                  token_type_id,
                  label_ids,
                  sens=None):
        """
        Run one training step.

        Args:
            input_ids, input_mask, token_type_id, label_ids: Inputs forwarded to the wrapped network.
            sens: Optional loss scale; when None the stored `loss_scale` parameter is used.

        Returns:
            Tuple (loss, cond), where `cond` is the overflow condition computed from the float status.
        """
        weights = self.weights
        init = self.alloc_status()
        loss = self.network(input_ids,
                            input_mask,
                            token_type_id,
                            label_ids)
        # Use the caller-supplied scale if there is one, otherwise the stored parameter.
        if sens is None:
            scaling_sens = self.loss_scale
        else:
            scaling_sens = sens
        grads = self.grad(self.network, weights)(input_ids,
                                                 input_mask,
                                                 token_type_id,
                                                 label_ids,
                                                 self.cast(scaling_sens,
                                                           mstype.float32))
        clear_before_grad = self.clear_before_grad(init)
        # Explicit control dependencies between the loss, the status buffer and the scale.
        F.control_depend(loss, init)
        self.depend_parameter_use(clear_before_grad, scaling_sens)
        # Divide every gradient by the loss scale, then clip.
        grads = self.hyper_map(F.partial(grad_scale, scaling_sens), grads)
        grads = self.clip_gradients(grads, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE)
        if self.reducer_flag:
            grads = self.grad_reducer(grads)
        flag = self.get_status(init)
        flag_sum = self.reduce_sum(init, (0,))
        # When not stand-alone, the status sum is all-reduced before the comparison.
        if self.is_distributed:
            flag_reduce = self.allreduce(flag_sum)
            cond = self.less_equal(self.base, flag_reduce)
        else:
            cond = self.less_equal(self.base, flag_sum)
        F.control_depend(grads, flag)
        F.control_depend(flag, flag_sum)
        overflow = cond
        # With the internal loss scale, the scale-update cell decides the final overflow flag.
        if sens is None:
            overflow = self.loss_scaling_manager(self.loss_scale, cond)
        # The optimizer step is skipped on overflow.
        if overflow:
            succ = False
        else:
            succ = self.optimizer(grads)
        ret = (loss, cond)
        return F.depend(ret, succ)
class BertCLSModel(nn.Cell):
    """
    Classification head on top of BERT, used for classification tasks such as
    XNLI (num_labels=3), LCQMC (num_labels=2) and Chnsenti (num_labels=2).

    The output is the log-softmax of the logits computed from BERT's pooled output.
    """
    def __init__(self, config, is_training, num_labels=2, dropout_prob=0.0, use_one_hot_embeddings=False):
        super(BertCLSModel, self).__init__()
        self.bert = BertModel(config, is_training, use_one_hot_embeddings)
        self.cast = P.Cast()
        self.weight_init = TruncatedNormal(config.initializer_range)
        self.log_softmax = P.LogSoftmax(axis=-1)
        self.dtype = config.dtype
        self.num_labels = num_labels
        # Dense classifier computed in the configured compute type.
        classifier = nn.Dense(config.hidden_size, self.num_labels,
                              weight_init=self.weight_init, has_bias=True)
        self.dense_1 = classifier.to_float(config.compute_type)
        # nn.Dropout receives 1 - dropout_prob.
        self.dropout = nn.Dropout(1 - dropout_prob)

    def construct(self, input_ids, input_mask, token_type_id):
        """Return log-probabilities over the labels for each example."""
        _, pooled_output, _ = self.bert(input_ids, token_type_id, input_mask)
        pooled = self.cast(pooled_output, self.dtype)
        pooled = self.dropout(pooled)
        scores = self.dense_1(pooled)
        scores = self.cast(scores, self.dtype)
        log_probs = self.log_softmax(scores)
        return log_probs
class BertNERModel(nn.Cell):
    """
    Sequence-labeling head on top of BERT, e.g. for NER (num_labels=11).

    Without CRF the output is the log-softmax of the per-token logits; with CRF
    the raw logits are returned, reshaped to (batch_size, seq_length, num_labels).
    """
    def __init__(self, config, is_training, num_labels=11, use_crf=False, dropout_prob=0.0,
                 use_one_hot_embeddings=False):
        super(BertNERModel, self).__init__()
        self.bert = BertModel(config, is_training, use_one_hot_embeddings)
        self.cast = P.Cast()
        self.weight_init = TruncatedNormal(config.initializer_range)
        self.log_softmax = P.LogSoftmax(axis=-1)
        self.dtype = config.dtype
        self.num_labels = num_labels
        # Dense tagger computed in the configured compute type.
        tagger = nn.Dense(config.hidden_size, self.num_labels,
                          weight_init=self.weight_init, has_bias=True)
        self.dense_1 = tagger.to_float(config.compute_type)
        # nn.Dropout receives 1 - dropout_prob.
        self.dropout = nn.Dropout(1 - dropout_prob)
        self.reshape = P.Reshape()
        # Target shape that flattens the sequence output to rows of hidden_size.
        self.shape = (-1, config.hidden_size)
        self.use_crf = use_crf
        self.origin_shape = (config.batch_size, config.seq_length, self.num_labels)

    def construct(self, input_ids, input_mask, token_type_id):
        """Return per-token log-probabilities, or reshaped logits when CRF is used."""
        sequence_output, _, _ = self.bert(input_ids, token_type_id, input_mask)
        hidden = self.dropout(sequence_output)
        hidden = self.reshape(hidden, self.shape)
        scores = self.dense_1(hidden)
        scores = self.cast(scores, self.dtype)
        if self.use_crf:
            return_value = self.reshape(scores, self.origin_shape)
        else:
            return_value = self.log_softmax(scores)
        return return_value
class CrossEntropyCalculation(nn.Cell):
    """
    Cross-entropy loss over log-probabilities.

    In training mode the mean negative log-probability of the true labels is
    returned; otherwise the input is passed through (multiplied by 1.0).
    """
    def __init__(self, is_training=True):
        super(CrossEntropyCalculation, self).__init__()
        self.is_training = is_training
        # One-hot encoding of the labels with values 1.0 / 0.0.
        self.onehot = P.OneHot()
        self.on_value = Tensor(1.0, mstype.float32)
        self.off_value = Tensor(0.0, mstype.float32)
        self.reduce_sum = P.ReduceSum()
        self.reduce_mean = P.ReduceMean()
        self.reshape = P.Reshape()
        self.neg = P.Neg()
        self.cast = P.Cast()
        # Used both as the flattened label shape and as the reduction axis.
        self.last_idx = (-1,)

    def construct(self, logits, label_ids, num_labels):
        """Return the scalar loss in training mode, else the unchanged ``logits``."""
        if self.is_training:
            flat_labels = self.reshape(label_ids, self.last_idx)
            targets = self.onehot(flat_labels, num_labels, self.on_value, self.off_value)
            example_loss = self.neg(self.reduce_sum(targets * logits, self.last_idx))
            mean_loss = self.reduce_mean(example_loss, self.last_idx)
            return_value = self.cast(mean_loss, mstype.float32)
        else:
            return_value = logits * 1.0
        return return_value
class BertCLS(nn.Cell):
    """
    Train interface for the classification fine-tuning task: BertCLSModel
    followed by CrossEntropyCalculation.
    """
    def __init__(self, config, is_training, num_labels=2, dropout_prob=0.0, use_one_hot_embeddings=False):
        super(BertCLS, self).__init__()
        self.bert = BertCLSModel(config, is_training, num_labels, dropout_prob, use_one_hot_embeddings)
        self.loss = CrossEntropyCalculation(is_training)
        self.num_labels = num_labels

    def construct(self, input_ids, input_mask, token_type_id, label_ids):
        """Return the result of the loss cell applied to the classifier's log-probabilities."""
        class_log_probs = self.bert(input_ids, input_mask, token_type_id)
        result = self.loss(class_log_probs, label_ids, self.num_labels)
        return result
class BertNER(nn.Cell):
    """
    Train interface for the sequence-labeling fine-tuning task: BertNERModel
    followed by either a CRF loss or CrossEntropyCalculation.
    """
    def __init__(self, config, is_training, num_labels=11, use_crf=False, tag_to_index=None, dropout_prob=0.0,
                 use_one_hot_embeddings=False):
        super(BertNER, self).__init__()
        self.bert = BertNERModel(config, is_training, num_labels, use_crf, dropout_prob, use_one_hot_embeddings)
        if use_crf:
            # The CRF loss needs the tag -> index mapping.
            if not tag_to_index:
                raise Exception("The dict for tag-index mapping should be provided for CRF.")
            self.loss = CRF(tag_to_index, config.batch_size, config.seq_length, is_training)
        else:
            self.loss = CrossEntropyCalculation(is_training)
        self.num_labels = num_labels
        self.use_crf = use_crf

    def construct(self, input_ids, input_mask, token_type_id, label_ids):
        """Return the result of the configured loss cell for the tagger output."""
        tagger_output = self.bert(input_ids, input_mask, token_type_id)
        if self.use_crf:
            result = self.loss(tagger_output, label_ids)
        else:
            result = self.loss(tagger_output, label_ids, self.num_labels)
        return result
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册