From af85268f718ec91c62b1f27fd4640a22c7f6b044 Mon Sep 17 00:00:00 2001 From: dyonghan Date: Mon, 18 May 2020 09:45:17 +0800 Subject: [PATCH] !1 Initial version Initial version --- .gitignore | 136 +++++ README.en.md | 35 +- README.md | 38 +- experiment_1/1-LeNet5_MNIST.ipynb | 517 ++++++++++++++++ experiment_1/main.py | 108 ++++ experiment_2/2-Save_And_Load_Model.ipynb | 566 ++++++++++++++++++ experiment_2/main.py | 142 +++++ experiment_3/3-Computer_Vision.md | 344 +++++++++++ experiment_3/dataset.py | 86 +++ experiment_3/resnet50_train.py | 172 ++++++ experiment_4/4-Natural_Language_Processing.md | 374 ++++++++++++ experiment_4/CRF.py | 177 ++++++ experiment_4/cluener_evaluation.py | 73 +++ experiment_4/evaluation.py | 161 +++++ experiment_4/evaluation_config.py | 53 ++ experiment_4/finetune.py | 152 +++++ experiment_4/finetune_config.py | 124 ++++ experiment_4/pretrain.py | 167 ++++++ experiment_4/sample_process.py | 100 ++++ experiment_4/tokenization.py | 388 ++++++++++++ experiment_4/utils.py | 263 ++++++++ project_1/1-Model_Optimization.ipynb | 565 +++++++++++++++++ 22 files changed, 4674 insertions(+), 67 deletions(-) create mode 100644 .gitignore create mode 100644 experiment_1/1-LeNet5_MNIST.ipynb create mode 100644 experiment_1/main.py create mode 100644 experiment_2/2-Save_And_Load_Model.ipynb create mode 100644 experiment_2/main.py create mode 100644 experiment_3/3-Computer_Vision.md create mode 100644 experiment_3/dataset.py create mode 100644 experiment_3/resnet50_train.py create mode 100644 experiment_4/4-Natural_Language_Processing.md create mode 100644 experiment_4/CRF.py create mode 100644 experiment_4/cluener_evaluation.py create mode 100644 experiment_4/evaluation.py create mode 100644 experiment_4/evaluation_config.py create mode 100644 experiment_4/finetune.py create mode 100644 experiment_4/finetune_config.py create mode 100644 experiment_4/pretrain.py create mode 100644 experiment_4/sample_process.py create mode 100644 
experiment_4/tokenization.py create mode 100644 experiment_4/utils.py create mode 100644 project_1/1-Model_Optimization.ipynb diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..09417a1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,136 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# system file +.DS_Store +.swap + +# IDE +.idea/ diff --git a/README.en.md b/README.en.md index cef451a..054d389 100644 --- a/README.en.md +++ b/README.en.md @@ -1,36 +1,7 @@ # course -#### Description -{**When you're done, you can delete the content in this README and update the file with details for others getting started with your repository**} +The experimental guidance based on the MindSpore open source deep learning framework. It is only used for teaching or training purposes. -#### Software Architecture -Software architecture description +Part of the content comes from the open source community, internet or third party. If something violates your rights, please leave a message via issue or submit a pull request. -#### Installation - -1. xxxx -2. xxxx -3. xxxx - -#### Instructions - -1. xxxx -2. xxxx -3. xxxx - -#### Contribution - -1. Fork the repository -2. Create Feat_xxx branch -3. Commit your code -4. Create Pull Request - - -#### Gitee Feature - -1. You can use Readme\_XXX.md to support different languages, such as Readme\_en.md, Readme\_zh.md -2. Gitee blog [blog.gitee.com](https://blog.gitee.com) -3. Explore open source project [https://gitee.com/explore](https://gitee.com/explore) -4. The most valuable open source project [GVP](https://gitee.com/gvp) -5. The manual of Gitee [https://gitee.com/help](https://gitee.com/help) -6. 
The most popular members [https://gitee.com/gitee-stars/](https://gitee.com/gitee-stars/) +Please go to [MindSpore Open Source Community](https://www.mindspore.cn/) for more videos and documentation tutorials. diff --git a/README.md b/README.md index 4d9ba31..698e3a7 100644 --- a/README.md +++ b/README.md @@ -1,39 +1,7 @@ # course -#### 介绍 -{**以下是码云平台说明,您可以替换此简介** -码云是 OSCHINA 推出的基于 Git 的代码托管平台(同时支持 SVN)。专为开发者提供稳定、高效、安全的云端软件开发协作平台 -无论是个人、团队、或是企业,都能够用码云实现代码托管、项目管理、协作开发。企业项目请看 [https://gitee.com/enterprises](https://gitee.com/enterprises)} +基于MindSpore开源深度学习框架的实验指导,仅用于教学或培训目的。 -#### 软件架构 -软件架构说明 +部分内容来源于开源社区、网络或第三方。如果有内容侵犯了您的权利,请通过issue留言,或者提交pull request。 - -#### 安装教程 - -1. xxxx -2. xxxx -3. xxxx - -#### 使用说明 - -1. xxxx -2. xxxx -3. xxxx - -#### 参与贡献 - -1. Fork 本仓库 -2. 新建 Feat_xxx 分支 -3. 提交代码 -4. 新建 Pull Request - - -#### 码云特技 - -1. 使用 Readme\_XXX.md 来支持不同的语言,例如 Readme\_en.md, Readme\_zh.md -2. 码云官方博客 [blog.gitee.com](https://blog.gitee.com) -3. 你可以 [https://gitee.com/explore](https://gitee.com/explore) 这个地址来了解码云上的优秀开源项目 -4. [GVP](https://gitee.com/gvp) 全称是码云最有价值开源项目,是码云综合评定出的优秀开源项目 -5. 码云官方提供的使用手册 [https://gitee.com/help](https://gitee.com/help) -6. 码云封面人物是一档用来展示码云会员风采的栏目 [https://gitee.com/gitee-stars/](https://gitee.com/gitee-stars/) +请前往[MindSpore开源社区](https://www.mindspore.cn/)获取更多视频和文档教程。 diff --git a/experiment_1/1-LeNet5_MNIST.ipynb b/experiment_1/1-LeNet5_MNIST.ipynb new file mode 100644 index 0000000..57f357e --- /dev/null +++ b/experiment_1/1-LeNet5_MNIST.ipynb @@ -0,0 +1,517 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

基于LeNet5的手写数字识别

\n", + "\n", + "[TOC]\n", + "\n", + "## 实验介绍\n", + "\n", + "LeNet5 + MNIST被誉为深度学习领域的“Hello world”。本实验主要介绍使用MindSpore在MNIST数据集上开发和训练一个LeNet5模型,并验证模型精度。\n", + "\n", + "## 实验目的\n", + "\n", + "- 了解如何使用MindSpore进行简单卷积神经网络的开发。\n", + "- 了解如何使用MindSpore进行简单图片分类任务的训练。\n", + "- 了解如何使用MindSpore进行简单图片分类任务的验证。\n", + "\n", + "## 预备知识\n", + "\n", + "- 熟练使用Python,了解Shell及Linux操作系统基本知识。\n", + "- 具备一定的深度学习理论知识,如卷积神经网络、损失函数、优化器,训练策略等。\n", + "- 了解华为云的基本使用方法,包括[OBS(对象存储)](https://www.huaweicloud.com/product/obs.html)、[ModelArts(AI开发平台)](https://www.huaweicloud.com/product/modelarts.html)、[Notebook(开发工具)](https://support.huaweicloud.com/engineers-modelarts/modelarts_23_0033.html)、[训练作业](https://support.huaweicloud.com/engineers-modelarts/modelarts_23_0046.html)等功能。华为云官网:https://www.huaweicloud.com\n", + "- 了解并熟悉MindSpore AI计算框架,MindSpore官网:https://www.mindspore.cn\n", + "\n", + "## 实验环境\n", + "\n", + "- MindSpore 0.2.0(MindSpore版本会定期更新,本指导也会定期刷新,与版本配套);\n", + "- 华为云ModelArts:ModelArts是华为云提供的面向开发者的一站式AI开发平台,集成了昇腾AI处理器资源池,用户可以在该平台下体验MindSpore。ModelArts官网:https://www.huaweicloud.com/product/modelarts.html\n", + "\n", + "## 实验准备\n", + "\n", + "### 创建OBS桶\n", + "\n", + "本实验需要使用华为云OBS存储实验脚本和数据集,可以参考[快速通过OBS控制台上传下载文件](https://support.huaweicloud.com/qs-obs/obs_qs_0001.html)了解使用OBS创建桶、上传文件、下载文件的使用方法。\n", + "\n", + "> **提示:**华为云新用户使用OBS时通常需要创建和配置“访问密钥”,可以在使用OBS时根据提示完成创建和配置。也可以参考[获取访问密钥并完成ModelArts全局配置](https://support.huaweicloud.com/prepare-modelarts/modelarts_08_0002.html)获取并配置访问密钥。\n", + "\n", + "创建OBS桶的参考配置如下:\n", + "\n", + "- 区域:华北-北京四\n", + "- 数据冗余存储策略:单AZ存储\n", + "- 桶名称:如ms-course\n", + "- 存储类别:标准存储\n", + "- 桶策略:公共读\n", + "- 归档数据直读:关闭\n", + "- 企业项目、标签等配置:免\n", + "\n", + "### 数据集准备\n", + "\n", + "MNIST是一个手写数字数据集,训练集包含60000张手写数字,测试集包含10000张手写数字,共10类。MNIST数据集的官网:[THE MNIST DATABASE](http://yann.lecun.com/exdb/mnist/)。\n", + "\n", + "从MNIST官网下载如下4个文件到本地并解压:\n", + "\n", + "```\n", + "train-images-idx3-ubyte.gz: training set images (9912422 bytes)\n", + "train-labels-idx1-ubyte.gz: training set
labels (28881 bytes)\n", + "t10k-images-idx3-ubyte.gz: test set images (1648877 bytes)\n", + "t10k-labels-idx1-ubyte.gz: test set labels (4542 bytes)\n", + "```\n", + "\n", + "### 脚本准备\n", + "\n", + "从[课程gitee仓库](https://gitee.com/mindspore/course)上下载本实验相关脚本。\n", + "\n", + "### 上传文件\n", + "\n", + "将脚本和数据集上传到OBS桶中,组织为如下形式:\n", + "\n", + "```\n", + "experiment_1\n", + "├── MNIST\n", + "│   ├── test\n", + "│   │   ├── t10k-images-idx3-ubyte\n", + "│   │   └── t10k-labels-idx1-ubyte\n", + "│   └── train\n", + "│   ├── train-images-idx3-ubyte\n", + "│   └── train-labels-idx1-ubyte\n", + "└── 脚本等文件\n", + "```\n", + "\n", + "## 实验步骤(方案一)\n", + "\n", + "### 创建Notebook\n", + "\n", + "可以参考[创建并打开Notebook](https://support.huaweicloud.com/engineers-modelarts/modelarts_23_0034.html)来创建并打开本实验的Notebook脚本。\n", + "\n", + "创建Notebook的参考配置:\n", + "\n", + "- 计费模式:按需计费\n", + "- 名称:experiment_1\n", + "- 工作环境:Python3\n", + "- 资源池:公共资源\n", + "- 类型:Ascend\n", + "- 规格:单卡1*Ascend 910\n", + "- 存储位置:对象存储服务(OBS)->选择上述新建的OBS桶中的experiment_1文件夹\n", + "- 自动停止等配置:默认\n", + "\n", + "> **注意:**\n", + "> - 打开Notebook前,在Jupyter Notebook文件列表页面,勾选目录里的所有文件/文件夹(实验脚本和数据集),并点击列表上方的“Sync OBS”按钮,使OBS桶中的所有文件同时同步到Notebook工作环境中,这样Notebook中的代码才能访问数据集。参考[使用Sync OBS功能](https://support.huaweicloud.com/engineers-modelarts/modelarts_23_0038.html)。\n", + "> - 打开Notebook后,选择MindSpore环境作为Kernel。\n", + "\n", + "> **提示:**上述数据集和脚本的准备工作也可以在Notebook环境中完成,在Jupyter Notebook文件列表页面,点击右上角的\"New\"->\"Terminal\",进入Notebook环境所在终端,进入`work`目录,可以使用常用的linux shell命令,如`wget, gzip, tar, mkdir, mv`等,完成数据集和脚本的下载和准备。\n", + "\n", + "> **提示:**请从上至下阅读提示并执行代码框进行体验。代码框执行过程中左侧呈现[\\*],代码框执行完毕后左侧呈现如[1],[2]等。请等上一个代码框执行完毕后再执行下一个代码框。\n", + "\n", + "导入MindSpore模块和辅助模块:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "# os.environ['DEVICE_ID'] = '0'\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "\n", + "import mindspore as ms\n", + "import 
mindspore.context as context\n", + "import mindspore.dataset.transforms.c_transforms as C\n", + "import mindspore.dataset.transforms.vision.c_transforms as CV\n", + "\n", + "from mindspore.dataset.transforms.vision import Inter\n", + "from mindspore import nn, Tensor\n", + "from mindspore.train import Model\n", + "from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor\n", + "from mindspore.train.serialization import load_checkpoint, load_param_into_net\n", + "\n", + "context.set_context(mode=context.GRAPH_MODE, device_target='Ascend')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 数据处理\n", + "\n", + "在使用数据集训练网络前,首先需要对数据进行预处理,如下:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "DATA_DIR_TRAIN = \"MNIST/train\" # 训练集信息\n", + "DATA_DIR_TEST = \"MNIST/test\" # 测试集信息\n", + "\n", + "def create_dataset(training=True, num_epoch=1, batch_size=32, resize=(32, 32),\n", + " rescale=1/(255*0.3081), shift=-0.1307/0.3081, buffer_size=64):\n", + " ds = ms.dataset.MnistDataset(DATA_DIR_TRAIN if training else DATA_DIR_TEST)\n", + " \n", + " # define map operations\n", + " resize_op = CV.Resize(resize)\n", + " rescale_op = CV.Rescale(rescale, shift)\n", + " hwc2chw_op = CV.HWC2CHW()\n", + " \n", + " # apply map operations on images\n", + " ds = ds.map(input_columns=\"image\", operations=[resize_op, rescale_op, hwc2chw_op])\n", + " ds = ds.map(input_columns=\"label\", operations=C.TypeCast(ms.int32))\n", + " \n", + " ds = ds.shuffle(buffer_size=buffer_size)\n", + " ds = ds.batch(batch_size, drop_remainder=True)\n", + " ds = ds.repeat(num_epoch)\n", + " \n", + " return ds" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "对其中几张图片进行可视化,可以看到图片中的手写数字,图片的大小为32x32。" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "ds = create_dataset(training=False)\n", + "data = ds.create_dict_iterator().get_next()\n", + "images = data['image']\n", + "labels = data['label']\n", + "\n", + "for i in range(1, 5):\n", + " plt.subplot(2, 2, i)\n", + " plt.imshow(np.squeeze(images[i]))\n", + " plt.title('Number: %s' % labels[i])\n", + " plt.xticks([])\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 定义模型\n", + "\n", + "定义LeNet5模型,模型结构如下图所示。\n", + "\n", + "\n", + "[1] 图片来源于http://deeplearning.net" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "class LeNet(nn.Cell):\n", + " def __init__(self):\n", + " super(LeNet, self).__init__()\n", + " self.relu = nn.ReLU()\n", + " self.conv1 = nn.Conv2d(1, 6, 5, stride=1, pad_mode='valid')\n", + " self.conv2 = nn.Conv2d(6, 16, 5, stride=1, pad_mode='valid')\n", + " self.pool = nn.MaxPool2d(kernel_size=2, stride=2)\n", + " self.flatten = nn.Flatten()\n", + " self.fc1 = nn.Dense(400, 120)\n", + " self.fc2 = nn.Dense(120, 84)\n", + " self.fc3 = nn.Dense(84, 10)\n", + " \n", + " def construct(self, input_x):\n", + " output = self.conv1(input_x)\n", + " output = self.relu(output)\n", + " output = self.pool(output)\n", + " output = self.conv2(output)\n", + " output = self.relu(output)\n", + " output = self.pool(output)\n", + " output = self.flatten(output)\n", + " output = self.fc1(output)\n", + " output = self.fc2(output)\n", + " output = self.fc3(output)\n", + " \n", + " return output" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 推理(训练前)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "使用matplotlib定义一个将推理结果可视化的辅助函数,如下:" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_images(pred_fn, ds, net):\n", + " 
for i in range(1, 5):\n", + " pred, image, label = pred_fn(ds, net)\n", + " plt.subplot(2, 2, i)\n", + " plt.imshow(np.squeeze(image))\n", + " color = 'blue' if pred == label else 'red'\n", + " plt.title(\"prediction: {}, truth: {}\".format(pred, label), color=color)\n", + " plt.xticks([])\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "使用随机初始化的LeNet模型对手写数字进行识别,可以看到识别结果是随机的(大多数情况下是错误的)。\n", + "\n", + "> **提示:**MindSpore提供的基础数据类型为Tensor,Tensor支持numpy、list、tuple作为输入,并将其转换为Tensor类型。" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "def infer(ds, model):\n", + " data = ds.get_next()\n", + " images = data['image']\n", + " labels = data['label']\n", + " output = model.predict(Tensor(data['image']))\n", + " pred = np.argmax(output.asnumpy(), axis=1)\n", + " return pred[0], images[0], labels[0]\n", + "\n", + "ds = create_dataset(training=False, batch_size=1).create_dict_iterator()\n", + "net = LeNet()\n", + "model = Model(net)\n", + "\n", + "plot_images(infer, ds, model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 训练\n", + "\n", + "使用MNIST数据集对上述定义的LeNet模型进行训练。训练策略如下表所示,可以调整训练策略并查看训练效果,要求验证精度大于95%。\n", + "\n", + "| batch size | number of epochs | learning rate | optimizer |\n", + "| -- | -- | -- | -- |\n", + "| 32 | 3 | 0.01 | Momentum 0.9 |" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch: 1 step: 1875 ,loss is 2.3086565\n", + "epoch: 2 step: 1875 ,loss is 0.22017351\n", + "epoch: 3 step: 1875 ,loss is 0.025683485\n", + "Metrics: {'acc': 0.9742588141025641, 'loss': 0.08628832848253062}\n" + ] + } + ], + "source": [ + "os.system('rm -f *.ckpt *.ir *.meta') # 清理旧的运行文件\n", + "LOOP_SINK = context.get_context('enable_loop_sink')\n", + "\n", + "def test_train(lr=0.01, momentum=0.9, num_epoch=3, ckpt_name=\"a_lenet\"):\n", + " ds_train = create_dataset(num_epoch=num_epoch)\n", + " ds_eval = create_dataset(training=False)\n", + " steps_per_epoch = ds_train.get_dataset_size()\n", + " \n", + " net = LeNet()\n", + " loss = nn.loss.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction='mean')\n", + " opt = nn.Momentum(net.trainable_params(), lr, momentum)\n", + " \n", + " ckpt_cfg = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=5)\n", + " ckpt_cb = ModelCheckpoint(prefix=ckpt_name, 
config=ckpt_cfg)\n", + " loss_cb = LossMonitor(per_print_times=1 if LOOP_SINK else steps_per_epoch)\n", + " \n", + " model = Model(net, loss, opt, metrics={'acc', 'loss'})\n", + " model.train(num_epoch, ds_train, callbacks=[ckpt_cb, loss_cb], dataset_sink_mode=True)\n", + " metrics = model.eval(ds_eval)\n", + " print('Metrics:', metrics)\n", + "\n", + "test_train()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 推理(训练后)\n", + "\n", + "使用训练后的LeNet模型对手写数字进行识别,可以看到识别结果基本上是正确的。" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "CKPT = 'a_lenet-3_1875.ckpt'\n", + "\n", + "ds = create_dataset(training=False, batch_size=1).create_dict_iterator()\n", + "net = LeNet()\n", + "param_dict = load_checkpoint(CKPT)\n", + "load_param_into_net(net, param_dict)\n", + "model = Model(net)\n", + "\n", + "plot_images(infer, ds, model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 实验步骤(方案二)\n", + "\n", + "### 代码梳理\n", + "\n", + "创建训练作业时,运行参数会通过脚本传参的方式输入给脚本代码,脚本必须解析传参才能在代码中使用相应参数。如data_url和train_url,分别对应数据存储路径(OBS路径)和训练输出路径(OBS路径)。脚本对传参进行解析后赋值到`args`变量里,在后续代码里可以使用。\n", + "\n", + "```python\n", + "import argparse\n", + "parser = argparse.ArgumentParser()\n", + "parser.add_argument('--data_url', required=True, default=None, help='Location of data.')\n", + "parser.add_argument('--train_url', required=True, default=None, help='Location of training outputs.')\n", + "parser.add_argument('--num_epochs', type=int, default=1, help='Number of training epochs.')\n", + "args, unknown = parser.parse_known_args()\n", + "```\n", + "\n", + "MindSpore暂时没有提供直接访问OBS数据的接口,需要通过MoXing提供的API与OBS交互。将OBS中存储的数据拷贝至执行容器:\n", + "\n", + "```python\n", + "import moxing as mox\n", + "mox.file.copy_parallel(src_url=args.data_url, dst_url='MNIST/')\n", + "```\n", + "\n", + "如需将训练输出(如模型Checkpoint)从执行容器拷贝至OBS,请参考:\n", + "\n", + "```python\n", + "import moxing as mox\n", + "mox.file.copy_parallel(src_url='output', dst_url='s3://OBS/PATH')\n", + "```\n", + "\n", + "其他代码分析请参考方案一。\n", + "\n", + "### 创建训练作业\n", + "\n", + "可以参考[使用常用框架训练模型](https://support.huaweicloud.com/engineers-modelarts/modelarts_23_0238.html)来创建并启动训练作业。\n", + "\n", + "创建训练作业的参考配置:\n", + "\n", + "- 算法来源:常用框架->Ascend-Powered-Engine->MindSpore\n", + "- 代码目录:选择上述新建的OBS桶中的experiment_1目录\n", + "- 启动文件:选择上述新建的OBS桶中的experiment_1目录下的`main.py`\n", + "- 数据来源:数据存储位置->选择上述新建的OBS桶中的experiment_1目录下的MNIST目录\n", + "- 
训练输出位置:选择上述新建的OBS桶中的experiment_1目录并在其中创建output目录\n", + "- 作业日志路径:同训练输出位置\n", + "- 规格:Ascend:1*Ascend 910\n", + "- 其他均为默认\n", + "\n", + "启动并查看训练过程:\n", + "\n", + "1. 点击提交以开始训练;\n", + "2. 在训练作业列表里可以看到刚创建的训练作业,在训练作业页面可以看到版本管理;\n", + "3. 点击运行中的训练作业,在展开的窗口中可以查看作业配置信息,以及训练过程中的日志,日志会不断刷新,等训练作业完成后也可以下载日志到本地进行查看;\n", + "4. 在训练日志中可以看到`epoch: 3 step: 1875 ,loss is 0.025683485`等字段,即训练过程的loss值;\n", + "5. 在训练日志中可以看到`Metrics: {'acc': 0.9742588141025641, 'loss': 0.08628832848253062}`字段,即训练完成后的验证精度。" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 实验小结\n", + "\n", + "本实验展示了如何使用MindSpore进行手写数字识别,以及开发、训练和使用LeNet模型。通过对LeNet模型做几代的训练,然后使用训练后的LeNet模型对手写数字进行识别,识别结果基本上是正确的。即LeNet学习到了如何进行手写数字识别。" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/experiment_1/main.py b/experiment_1/main.py new file mode 100644 index 0000000..5970660 --- /dev/null +++ b/experiment_1/main.py @@ -0,0 +1,108 @@ +# LeNet5 mnist + +import os +# os.environ['DEVICE_ID'] = '0' +# Log level includes 3(ERROR), 2(WARNING), 1(INFO), 0(DEBUG). 
+os.environ['GLOG_v'] = '1' + +import matplotlib.pyplot as plt +import numpy as np + +import mindspore as ms +import mindspore.context as context +import mindspore.dataset.transforms.c_transforms as C +import mindspore.dataset.transforms.vision.c_transforms as CV + +from mindspore.dataset.transforms.vision import Inter +from mindspore import nn, Tensor +from mindspore.train import Model +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor +from mindspore.train.serialization import load_checkpoint, load_param_into_net + +context.set_context(mode=context.GRAPH_MODE, device_target='Ascend') + +DATA_DIR_TRAIN = "MNIST/train" # 训练集信息 +DATA_DIR_TEST = "MNIST/test" # 测试集信息 + + +def create_dataset(training=True, num_epoch=1, batch_size=32, resize=(32, 32), + rescale=1/(255*0.3081), shift=-0.1307/0.3081, buffer_size=64): + ds = ms.dataset.MnistDataset(DATA_DIR_TRAIN if training else DATA_DIR_TEST) + + # define map operations + resize_op = CV.Resize(resize) + rescale_op = CV.Rescale(rescale, shift) + hwc2chw_op = CV.HWC2CHW() + + # apply map operations on images + ds = ds.map(input_columns="image", operations=[resize_op, rescale_op, hwc2chw_op]) + ds = ds.map(input_columns="label", operations=C.TypeCast(ms.int32)) + + ds = ds.shuffle(buffer_size=buffer_size) + ds = ds.batch(batch_size, drop_remainder=True) + ds = ds.repeat(num_epoch) + + return ds + + +class LeNet(nn.Cell): + def __init__(self): + super(LeNet, self).__init__() + self.relu = nn.ReLU() + self.conv1 = nn.Conv2d(1, 6, 5, stride=1, pad_mode='valid') + self.conv2 = nn.Conv2d(6, 16, 5, stride=1, pad_mode='valid') + self.pool = nn.MaxPool2d(kernel_size=2, stride=2) + self.flatten = nn.Flatten() + self.fc1 = nn.Dense(400, 120) + self.fc2 = nn.Dense(120, 84) + self.fc3 = nn.Dense(84, 10) + + def construct(self, input_x): + output = self.conv1(input_x) + output = self.relu(output) + output = self.pool(output) + output = self.conv2(output) + output = self.relu(output) + output = 
self.pool(output) + output = self.flatten(output) + output = self.fc1(output) + output = self.fc2(output) + output = self.fc3(output) + + return output + + +LOOP_SINK = context.get_context('enable_loop_sink') + +def test_train(lr=0.01, momentum=0.9, num_epoch=3, ckpt_name="a_lenet"): + ds_train = create_dataset(num_epoch=num_epoch) + ds_eval = create_dataset(training=False) + steps_per_epoch = ds_train.get_dataset_size() + + net = LeNet() + loss = nn.loss.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction='mean') + opt = nn.Momentum(net.trainable_params(), lr, momentum) + + ckpt_cfg = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=5) + ckpt_cb = ModelCheckpoint(prefix=ckpt_name, config=ckpt_cfg) + loss_cb = LossMonitor(per_print_times=1 if LOOP_SINK else steps_per_epoch) + + model = Model(net, loss, opt, metrics={'acc', 'loss'}) + model.train(num_epoch, ds_train, callbacks=[ckpt_cb, loss_cb], dataset_sink_mode=True) + metrics = model.eval(ds_eval) + print('Metrics:', metrics) + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--data_url', required=True, default=None, help='Location of data.') + parser.add_argument('--train_url', required=True, default=None, help='Location of training outputs.') + parser.add_argument('--num_epochs', type=int, default=1, help='Number of training epochs.') + args, unknown = parser.parse_known_args() + + import moxing as mox + mox.file.copy_parallel(src_url=args.data_url, dst_url='MNIST/') + + os.system('rm -f *.ckpt *.ir *.meta') # 清理旧的运行文件 + + test_train() diff --git a/experiment_2/2-Save_And_Load_Model.ipynb b/experiment_2/2-Save_And_Load_Model.ipynb new file mode 100644 index 0000000..7cb3a14 --- /dev/null +++ b/experiment_2/2-Save_And_Load_Model.ipynb @@ -0,0 +1,566 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

训练时模型的保存和加载

\n", + "\n", + "[TOC]\n", + "\n", + "## 实验介绍\n", + "\n", + "本实验主要介绍使用MindSpore实现训练时模型的保存和加载。训练过程中保存模型以及训练中断后基于断点继续训练是一项非常常用的功能。建议先阅读MindSpore官网教程中关于模型参数保存和加载的内容。\n", + "\n", + "## 实验目的\n", + "\n", + "- 了解如何使用MindSpore实现训练时模型的保存。\n", + "- 了解如何使用MindSpore加载保存的模型文件并继续训练。\n", + "- 了解如何使用MindSpore的Callback功能。\n", + "\n", + "## 预备知识\n", + "\n", + "- 熟练使用Python,了解Shell及Linux操作系统基本知识。\n", + "- 具备一定的深度学习理论知识,如卷积神经网络、损失函数、优化器,训练策略、Checkpoint等。\n", + "- 了解华为云的基本使用方法,包括[OBS(对象存储)](https://www.huaweicloud.com/product/obs.html)、[ModelArts(AI开发平台)](https://www.huaweicloud.com/product/modelarts.html)、[Notebook(开发工具)](https://support.huaweicloud.com/engineers-modelarts/modelarts_23_0033.html)、[训练作业](https://support.huaweicloud.com/engineers-modelarts/modelarts_23_0046.html)等功能。华为云官网:https://www.huaweicloud.com\n", + "- 了解并熟悉MindSpore AI计算框架,MindSpore官网:https://www.mindspore.cn/\n", + "\n", + "## 实验环境\n", + "\n", + "- MindSpore 0.2.0(MindSpore版本会定期更新,本指导也会定期刷新,与版本配套);\n", + "- 华为云ModelArts:ModelArts是华为云提供的面向开发者的一站式AI开发平台,集成了昇腾AI处理器资源池,用户可以在该平台下体验MindSpore。ModelArts官网:https://www.huaweicloud.com/product/modelarts.html\n", + "\n", + "## 实验准备\n", + "\n", + "### 创建OBS桶\n", + "\n", + "本实验需要使用华为云OBS存储实验脚本和数据集,可以参考[快速通过OBS控制台上传下载文件](https://support.huaweicloud.com/qs-obs/obs_qs_0001.html)了解使用OBS创建桶、上传文件、下载文件的使用方法。\n", + "\n", + "> **提示:**华为云新用户使用OBS时通常需要创建和配置“访问密钥”,可以在使用OBS时根据提示完成创建和配置。也可以参考[获取访问密钥并完成ModelArts全局配置](https://support.huaweicloud.com/prepare-modelarts/modelarts_08_0002.html)获取并配置访问密钥。\n", + "\n", + "创建OBS桶的参考配置如下:\n", + "\n", + "- 区域:华北-北京四\n", + "- 数据冗余存储策略:单AZ存储\n", + "- 桶名称:如ms-course\n", + "- 存储类别:标准存储\n", + "- 桶策略:公共读\n", + "- 归档数据直读:关闭\n", + "- 企业项目、标签等配置:免\n", + "\n", + "### 数据集准备\n", + "\n", + "MNIST是一个手写数字数据集,训练集包含60000张手写数字,测试集包含10000张手写数字,共10类。MNIST数据集的官网:[THE MNIST DATABASE](http://yann.lecun.com/exdb/mnist/)。\n", + "\n", + "从MNIST官网下载如下4个文件到本地并解压:\n", + "\n", + "```\n", + "train-images-idx3-ubyte.gz: training set images (9912422 bytes)\n", +
"train-labels-idx1-ubyte.gz: training set labels (28881 bytes)\n", + "t10k-images-idx3-ubyte.gz: test set images (1648877 bytes)\n", + "t10k-labels-idx1-ubyte.gz: test set labels (4542 bytes)\n", + "```\n", + "\n", + "### 脚本准备\n", + "\n", + "从[课程gitee仓库](https://gitee.com/mindspore/course)上下载本实验相关脚本。\n", + "\n", + "### 上传文件\n", + "\n", + "将脚本和数据集上传到OBS桶中,组织为如下形式:\n", + "\n", + "```\n", + "experiment_1\n", + "├── MNIST\n", + "│   ├── test\n", + "│   │   ├── t10k-images-idx3-ubyte\n", + "│   │   └── t10k-labels-idx1-ubyte\n", + "│   └── train\n", + "│   ├── train-images-idx3-ubyte\n", + "│   └── train-labels-idx1-ubyte\n", + "└── 脚本等文件\n", + "```\n", + "\n", + "## 实验步骤(方案一)\n", + "\n", + "### 创建Notebook\n", + "\n", + "可以参考[创建并打开Notebook](https://support.huaweicloud.com/engineers-modelarts/modelarts_23_0034.html)来创建并打开本实验的Notebook脚本。\n", + "\n", + "创建Notebook的参考配置:\n", + "\n", + "- 计费模式:按需计费\n", + "- 名称:experiment_2\n", + "- 工作环境:Python3\n", + "- 资源池:公共资源\n", + "- 类型:Ascend\n", + "- 规格:单卡1*Ascend 910\n", + "- 存储位置:对象存储服务(OBS)->选择上述新建的OBS桶中的experiment_2文件夹\n", + "- 自动停止等配置:默认\n", + "\n", + "> **注意:**\n", + "> - 打开Notebook前,在Jupyter Notebook文件列表页面,勾选目录里的所有文件/文件夹(实验脚本和数据集),并点击列表上方的“Sync OBS”按钮,使OBS桶中的所有文件同时同步到Notebook工作环境中,这样Notebook中的代码才能访问数据集。参考[使用Sync OBS功能](https://support.huaweicloud.com/engineers-modelarts/modelarts_23_0038.html)。\n", + "> - 打开Notebook后,选择MindSpore环境作为Kernel。\n", + "\n", + "> **提示:**上述数据集和脚本的准备工作也可以在Notebook环境中完成,在Jupyter Notebook文件列表页面,点击右上角的\"New\"->\"Terminal\",进入Notebook环境所在终端,进入`work`目录,可以使用常用的linux shell命令,如`wget, gzip, tar, mkdir, mv`等,完成数据集和脚本的下载和准备。\n", + "\n", + "> **提示:**请从上至下阅读提示并执行代码框进行体验。代码框执行过程中左侧呈现[\\*],代码框执行完毕后左侧呈现如[1],[2]等。请等上一个代码框执行完毕后再执行下一个代码框。\n", + "\n", + "导入MindSpore模块和辅助模块:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "# os.environ['DEVICE_ID'] = '0'\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "\n", + 
"import mindspore as ms\n", + "import mindspore.context as context\n", + "import mindspore.dataset.transforms.c_transforms as C\n", + "import mindspore.dataset.transforms.vision.c_transforms as CV\n", + "\n", + "from mindspore.dataset.transforms.vision import Inter\n", + "from mindspore import nn, Tensor\n", + "from mindspore.train import Model\n", + "from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor\n", + "from mindspore.train.serialization import load_checkpoint, load_param_into_net\n", + "\n", + "import logging; logging.getLogger('matplotlib.font_manager').disabled = True\n", + "\n", + "context.set_context(mode=context.GRAPH_MODE, device_target='Ascend')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 数据处理\n", + "\n", + "在使用数据集训练网络前,首先需要对数据进行预处理,如下:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "DATA_DIR_TRAIN = \"MNIST/train\" # 训练集信息\n", + "DATA_DIR_TEST = \"MNIST/test\" # 测试集信息\n", + "\n", + "def create_dataset(training=True, num_epoch=1, batch_size=32, resize=(32, 32),\n", + " rescale=1/(255*0.3081), shift=-0.1307/0.3081, buffer_size=64):\n", + " ds = ms.dataset.MnistDataset(DATA_DIR_TRAIN if training else DATA_DIR_TEST)\n", + " \n", + " # define map operations\n", + " resize_op = CV.Resize(resize)\n", + " rescale_op = CV.Rescale(rescale, shift)\n", + " hwc2chw_op = CV.HWC2CHW()\n", + " \n", + " # apply map operations on images\n", + " ds = ds.map(input_columns=\"image\", operations=[resize_op, rescale_op, hwc2chw_op])\n", + " ds = ds.map(input_columns=\"label\", operations=C.TypeCast(ms.int32))\n", + " \n", + " ds = ds.shuffle(buffer_size=buffer_size)\n", + " ds = ds.batch(batch_size, drop_remainder=True)\n", + " ds = ds.repeat(num_epoch)\n", + " \n", + " return ds" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 定义模型\n", + "\n", + "定义LeNet5模型,模型结构如下图所示。\n", + "\n", + "\n", + "[1] 
图片来源于http://deeplearning.net" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "class LeNet(nn.Cell):\n", + " def __init__(self):\n", + " super(LeNet, self).__init__()\n", + " self.relu = nn.ReLU()\n", + " self.conv1 = nn.Conv2d(1, 6, 5, stride=1, pad_mode='valid')\n", + " self.conv2 = nn.Conv2d(6, 16, 5, stride=1, pad_mode='valid')\n", + " self.pool = nn.MaxPool2d(kernel_size=2, stride=2)\n", + " self.flatten = nn.Flatten()\n", + " self.fc1 = nn.Dense(400, 120)\n", + " self.fc2 = nn.Dense(120, 84)\n", + " self.fc3 = nn.Dense(84, 10)\n", + " \n", + " def construct(self, input_x):\n", + " output = self.conv1(input_x)\n", + " output = self.relu(output)\n", + " output = self.pool(output)\n", + " output = self.conv2(output)\n", + " output = self.relu(output)\n", + " output = self.pool(output)\n", + " output = self.flatten(output)\n", + " output = self.fc1(output)\n", + " output = self.fc2(output)\n", + " output = self.fc3(output)\n", + " \n", + " return output" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 保存模型Checkpoint\n", + "\n", + "使用MNIST数据集对上述定义的LeNet5模型进行单机单卡训练,包含:\n", + "\n", + "- 在MNIST数据集上训练模型。\n", + "- 通过`ModelCheckpoint`保存Checkpoint。\n", + "- 通过`LossMonitor`输出训练过程中的Loss。\n", + "\n", + "Callback是模型训练/测试过程中的一种调试工具,可用在训练/测试过程中执行特定的任务。MindSpore框架提供的Callback:\n", + "\n", + "- `ModelCheckpoint`:保存网络模型和参数,默认会保存最后一次训练的参数。\n", + "- `SummaryStep`:对Tensor值进行监控。此功能会在MindData平台训练脚本中使用。\n", + "- `LossMonitor`:监控loss值,当loss值为Nan或Inf时停止训练。此功能会在MindData平台训练脚本中使用。\n", + "\n", + "`ModelCheckpoint`用于保存模型和参数,如每个epoch结束时,都保存一次checkpoint。\n", + "\n", + "1. 首先需要初始化一个`CheckpointConfig`类对象,用以声明保存策略。调用方法如:\n", + " \n", + " ```py\n", + " CheckpointConfig(save_checkpoint_steps=1, keep_checkpoint_max=5)\n", + " ```\n", + " \n", + " 参数说明:\n", + " \n", + " - `save_checkpoint_steps`:每多少step保存一个checkpoint文件,单位为step;\n", + " - `keep_checkpoint_max`:最多保留checkpoint文件的数量(按最新的文件)。\n", + "\n", + "2. 
创建`ModelCheckpoint`对象。调用方法如:\n", + " \n", + " ```py\n", + " ModelCheckpoint(prefix=DEFAULT_CHECKPOINT_PREFIX_NAME, config=None)\n", + " ```\n", + " \n", + " 参数说明:\n", + " \n", + " - `prefix`:保存的文件前缀名,如'ck_lenet'。\n", + " - `config`:配置策略信息,传入上文创建的CheckpointConfig对象。\n", + "\n", + "> `ModelCheckpoint`会生成和保存模型(.pkl)和Checkpoint(.ckpt)文件。" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch: 1 step: 1875 ,loss is 2.3151364\n", + "epoch: 2 step: 1875 ,loss is 0.3097728\n", + "Metrics: {'acc': 0.9417067307692307, 'loss': 0.18866610953894755}\n", + "b_lenet-1_1875.ckpt\n", + "b_lenet-2_1875.ckpt\n" + ] + } + ], + "source": [ + "os.system('rm -f *.ckpt *.ir *.meta') # 清理旧的运行文件\n", + "LOOP_SINK = context.get_context('enable_loop_sink')\n", + "\n", + "def test_train(lr=0.01, momentum=0.9, num_epoch=2, check_point_name=\"b_lenet\"):\n", + " ds_train = create_dataset(num_epoch=num_epoch)\n", + " ds_eval = create_dataset(training=False)\n", + " steps_per_epoch = ds_train.get_dataset_size()\n", + " \n", + " net = LeNet()\n", + " loss = nn.loss.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction='mean')\n", + " opt = nn.Momentum(net.trainable_params(), lr, momentum)\n", + " \n", + " ckpt_cfg = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=5)\n", + " ckpt_cb = ModelCheckpoint(prefix=check_point_name, config=ckpt_cfg)\n", + " loss_cb = LossMonitor(per_print_times=1 if LOOP_SINK else steps_per_epoch)\n", + " \n", + " model = Model(net, loss, opt, metrics={'acc', 'loss'})\n", + " model.train(num_epoch, ds_train, callbacks=[ckpt_cb, loss_cb], dataset_sink_mode=True)\n", + " metrics = model.eval(ds_eval)\n", + " print('Metrics:', metrics)\n", + "\n", + "test_train()\n", + "print('\\n'.join(sorted([x for x in os.listdir('.') if x.startswith('b_lenet')])))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + 
"### 加载Checkpoint继续训练\n", + "\n", + "模型训练过程偶尔会中断,可以通过加载Checkpoint文件继续训练。\n", + "\n", + "1. 读取Checkpoint文件,调用方法如:\n", + " \n", + " ```py\n", + " load_checkpoint(ckpoint_file_name)\n", + " ```\n", + " \n", + " 参数说明:\n", + " \n", + " - `ckpoint_file_name`:checkpoint文件名,如'ck_lenet-7_1875.ckpt'。\n", + " - 返回值:返回一个字典。key为参数name,value为parameter类型的实例。\n", + "\n", + "2. 加载参数后继续训练,调用方法如:\n", + " \n", + " ```py\n", + " load_param_into_net(net, param_dict)\n", + " ```\n", + " \n", + " 参数说明:\n", + " \n", + " - `net`:初始不带优化器和损失函数的网络,如:`Resnet()`。\n", + " - `param_dict`:加载checkpoint文件后生成的字典。\n", + "\n", + "> 使用load_checkpoint接口加载数据时,需要把数据传入给原始网络,而不能传递给带有优化器和损失函数的训练网络。" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch: 1 step: 1875 ,loss is 0.1638589\n", + "epoch: 2 step: 1875 ,loss is 0.060048036\n", + "Metrics: {'acc': 0.9742588141025641, 'loss': 0.07910804035148034}\n", + "b_lenet_1-1_1875.ckpt\n", + "b_lenet_1-2_1875.ckpt\n" + ] + } + ], + "source": [ + "CKPT = 'b_lenet-2_1875.ckpt'\n", + "\n", + "def resume_train(lr=0.001, momentum=0.9, num_epoch=2, ckpt_name=\"b_lenet\"):\n", + " ds_train = create_dataset(num_epoch=num_epoch)\n", + " ds_eval = create_dataset(training=False)\n", + " steps_per_epoch = ds_train.get_dataset_size()\n", + " \n", + " net = LeNet()\n", + " loss = nn.loss.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction='mean')\n", + " opt = nn.Momentum(net.trainable_params(), lr, momentum)\n", + " \n", + " param_dict = load_checkpoint(CKPT)\n", + " load_param_into_net(net, param_dict)\n", + " load_param_into_net(opt, param_dict)\n", + " \n", + " ckpt_cfg = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=5)\n", + " ckpt_cb = ModelCheckpoint(prefix=ckpt_name, config=ckpt_cfg)\n", + " loss_cb = LossMonitor(per_print_times=1 if LOOP_SINK else steps_per_epoch)\n", + " \n", + " model = Model(net, loss, opt, 
metrics={'acc', 'loss'})\n", + " model.train(num_epoch, ds_train, callbacks=[ckpt_cb, loss_cb])\n", + " \n", + " metrics = model.eval(ds_eval)\n", + " print('Metrics:', metrics)\n", + "\n", + "resume_train()\n", + "print('\\n'.join(sorted([x for x in os.listdir('.') if x.startswith('b_lenet')])))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 推理\n", + "\n", + "加载Checkpoint,并执行验证。读取模型和Checkpoint文件,调用方法如:\n", + " \n", + " ```py\n", + " load(model_file_name, ckpoint_file_name)\n", + " ```\n", + " \n", + " 参数说明:\n", + " \n", + " - `model_file_name`:模型文件名,如'ck_lenet-model.pkl'。\n", + " - `ckpoint_file_name`:checkpoint文件名,如'ck_lenet-7_1875.ckpt'。\n", + " \n", + "使用matplotlib定义一个将推理结果可视化的辅助函数,如下:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_images(pred_fn, ds, net):\n", + " for i in range(1, 5):\n", + " pred, image, label = pred_fn(ds, net)\n", + " plt.subplot(2, 2, i)\n", + " plt.imshow(np.squeeze(image))\n", + " color = 'blue' if pred == label else 'red'\n", + " plt.title(\"prediction: {}, truth: {}\".format(pred, label), color=color)\n", + " plt.xticks([])\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "使用训练后的LeNet模型对手写数字进行识别,可以看到识别结果基本上是正确的。" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "CKPT = 'b_lenet_1-2_1875.ckpt'\n", + "\n", + "def infer(ds, model):\n", + " data = ds.get_next()\n", + " images = data['image']\n", + " labels = data['label']\n", + " output = model.predict(Tensor(data['image']))\n", + " pred = np.argmax(output.asnumpy(), axis=1)\n", + " return pred[0], images[0], labels[0]\n", + "\n", + "ds = create_dataset(training=False, batch_size=1).create_dict_iterator()\n", + "net = LeNet()\n", + "param_dict = load_checkpoint(CKPT)\n", + "load_param_into_net(net, param_dict)\n", + "model = Model(net)\n", + "plot_images(infer, ds, model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 实验步骤(方案二)\n", + "\n", + "### 代码梳理\n", + "\n", + "创建训练作业时,运行参数会通过脚本传参的方式输入给脚本代码,脚本必须解析传参才能在代码中使用相应参数。如data_url和train_url,分别对应数据存储路径(OBS路径)和训练输出路径(OBS路径)。脚本对传参进行解析后赋值到`args`变量里,在后续代码里可以使用。\n", + "\n", + "```python\n", + "import argparse\n", + "parser = argparse.ArgumentParser()\n", + "parser.add_argument('--data_url', required=True, default=None, help='Location of data.')\n", + "parser.add_argument('--train_url', required=True, default=None, help='Location of training outputs.')\n", + "parser.add_argument('--num_epochs', type=int, default=1, help='Number of training epochs.')\n", + "args, unknown = parser.parse_known_args()\n", + "```\n", + "\n", + "MindSpore暂时没有提供直接访问OBS数据的接口,需要通过MoXing提供的API与OBS交互。将OBS中存储的数据拷贝至执行容器:\n", + "\n", + "```python\n", + "import moxing as mox\n", + "mox.file.copy_parallel(src_url=args.data_url, dst_url='MNIST/')\n", + "```\n", + "\n", + "如需将训练输出(如模型Checkpoint)从执行容器拷贝至OBS,请参考:\n", + "\n", + "```python\n", + "import moxing as mox\n", + "mox.file.copy_parallel(src_url='output', dst_url='s3://OBS/PATH')\n", + "```\n", + "\n", + "其他代码分析请参考方案一。\n", + "\n", + "### 创建训练作业\n", + "\n", + "可以参考[使用常用框架训练模型](https://support.huaweicloud.com/engineers-modelarts/modelarts_23_0238.html)来创建并启动训练作业。\n", + 
"\n", + "创建训练作业的参考配置:\n", + "\n", + "- 算法来源:常用框架->Ascend-Powered-Engine->MindSpore\n", + "- 代码目录:选择上述新建的OBS桶中的experiment_2目录\n", + "- 启动文件:选择上述新建的OBS桶中的experiment_2目录下的`main.py`\n", + "- 数据来源:数据存储位置->选择上述新建的OBS桶中的experiment_1文件夹下的MNIST目录\n", + "- 训练输出位置:选择上述新建的OBS桶中的experiment_1目录并在其中创建output目录\n", + "- 作业日志路径:同训练输出位置\n", + "- 规格:Ascend:1*Ascend 910\n", + "- 其他均为默认\n", + "\n", + "启动并查看训练过程:\n", + "\n", + "1. 点击提交以开始训练;\n", + "2. 在训练作业列表里可以看到刚创建的训练作业,在训练作业页面可以看到版本管理;\n", + "3. 点击运行中的训练作业,在展开的窗口中可以查看作业配置信息,以及训练过程中的日志,日志会不断刷新,等训练作业完成后也可以下载日志到本地进行查看;\n", + "4. 在训练日志中可以看到`epoch: 3 step: 1875 ,loss is 0.025683485`等字段,即训练过程的loss值;\n", + "5. 在训练日志中可以看到`Metrics: {'acc': 0.9742588141025641, 'loss': 0.08628832848253062}`等字段,即训练完成后的验证精度;\n", + "6. 在训练日志里可以看到`b_lenet_1-2_1875.ckpt`等字段,即训练过程保存的Checkpoint。" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 实验小结\n", + "\n", + "本实验展示了MindSpore的Checkpoint、断点继续训练等高级特性:\n", + "1. 使用MindSpore的ModelCheckpoint接口每个epoch保存一次Checkpoint,训练2个epoch并终止。\n", + "2. 使用MindSpore的load_checkpoint和load_param_into_net接口加载上一步保存的Checkpoint继续训练2个epoch。\n", + "3. 观察训练过程中Loss的变化情况,加载Checkpoint继续训练后loss进一步下降。" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/experiment_2/main.py b/experiment_2/main.py new file mode 100644 index 0000000..9e5c98b --- /dev/null +++ b/experiment_2/main.py @@ -0,0 +1,142 @@ +# Save and load model + +import os +# os.environ['DEVICE_ID'] = '0' +# Log level includes 3(ERROR), 2(WARNING), 1(INFO), 0(DEBUG). 
+os.environ['GLOG_v'] = '2' + +import matplotlib.pyplot as plt +import numpy as np + +import mindspore as ms +import mindspore.context as context +import mindspore.dataset.transforms.c_transforms as C +import mindspore.dataset.transforms.vision.c_transforms as CV + +from mindspore.dataset.transforms.vision import Inter +from mindspore import nn, Tensor +from mindspore.train import Model +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor +from mindspore.train.serialization import load_checkpoint, load_param_into_net + +import logging; logging.getLogger('matplotlib.font_manager').disabled = True + +context.set_context(mode=context.GRAPH_MODE, device_target='Ascend') + +DATA_DIR_TRAIN = "MNIST/train" # 训练集信息 +DATA_DIR_TEST = "MNIST/test" # 测试集信息 + + +def create_dataset(training=True, num_epoch=1, batch_size=32, resize=(32, 32), + rescale=1/(255*0.3081), shift=-0.1307/0.3081, buffer_size=64): + ds = ms.dataset.MnistDataset(DATA_DIR_TRAIN if training else DATA_DIR_TEST) + + # define map operations + resize_op = CV.Resize(resize) + rescale_op = CV.Rescale(rescale, shift) + hwc2chw_op = CV.HWC2CHW() + + # apply map operations on images + ds = ds.map(input_columns="image", operations=[resize_op, rescale_op, hwc2chw_op]) + ds = ds.map(input_columns="label", operations=C.TypeCast(ms.int32)) + + ds = ds.shuffle(buffer_size=buffer_size) + ds = ds.batch(batch_size, drop_remainder=True) + ds = ds.repeat(num_epoch) + + return ds + + +class LeNet(nn.Cell): + def __init__(self): + super(LeNet, self).__init__() + self.relu = nn.ReLU() + self.conv1 = nn.Conv2d(1, 6, 5, stride=1, pad_mode='valid') + self.conv2 = nn.Conv2d(6, 16, 5, stride=1, pad_mode='valid') + self.pool = nn.MaxPool2d(kernel_size=2, stride=2) + self.flatten = nn.Flatten() + self.fc1 = nn.Dense(400, 120) + self.fc2 = nn.Dense(120, 84) + self.fc3 = nn.Dense(84, 10) + + def construct(self, input_x): + output = self.conv1(input_x) + output = self.relu(output) + output = self.pool(output) 
+ output = self.conv2(output) + output = self.relu(output) + output = self.pool(output) + output = self.flatten(output) + output = self.fc1(output) + output = self.fc2(output) + output = self.fc3(output) + + return output + + +LOOP_SINK = context.get_context('enable_loop_sink') + +def test_train(lr=0.01, momentum=0.9, num_epoch=2, check_point_name="b_lenet"): + ds_train = create_dataset(num_epoch=num_epoch) + ds_eval = create_dataset(training=False) + steps_per_epoch = ds_train.get_dataset_size() + + net = LeNet() + loss = nn.loss.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction='mean') + opt = nn.Momentum(net.trainable_params(), lr, momentum) + + ckpt_cfg = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=5) + ckpt_cb = ModelCheckpoint(prefix=check_point_name, config=ckpt_cfg) + loss_cb = LossMonitor(per_print_times=1 if LOOP_SINK else steps_per_epoch) + + model = Model(net, loss, opt, metrics={'acc', 'loss'}) + model.train(num_epoch, ds_train, callbacks=[ckpt_cb, loss_cb], dataset_sink_mode=True) + metrics = model.eval(ds_eval) + print('Metrics:', metrics) + + +CKPT = 'b_lenet-2_1875.ckpt' + +def resume_train(lr=0.001, momentum=0.9, num_epoch=2, ckpt_name="b_lenet"): + ds_train = create_dataset(num_epoch=num_epoch) + ds_eval = create_dataset(training=False) + steps_per_epoch = ds_train.get_dataset_size() + + net = LeNet() + loss = nn.loss.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction='mean') + opt = nn.Momentum(net.trainable_params(), lr, momentum) + + param_dict = load_checkpoint(CKPT) + load_param_into_net(net, param_dict) + load_param_into_net(opt, param_dict) + + ckpt_cfg = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=5) + ckpt_cb = ModelCheckpoint(prefix=ckpt_name, config=ckpt_cfg) + loss_cb = LossMonitor(per_print_times=1 if LOOP_SINK else steps_per_epoch) + + model = Model(net, loss, opt, metrics={'acc', 'loss'}) + model.train(num_epoch, ds_train, 
callbacks=[ckpt_cb, loss_cb]) + + metrics = model.eval(ds_eval) + print('Metrics:', metrics) + + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--data_url', required=True, default=None, help='Location of data.') + parser.add_argument('--train_url', required=True, default=None, help='Location of training outputs.') + parser.add_argument('--num_epochs', type=int, default=1, help='Number of training epochs.') + args, unknown = parser.parse_known_args() + + import moxing as mox + mox.file.copy_parallel(src_url=args.data_url, dst_url='MNIST/') + + os.system('rm -f *.ckpt *.ir *.meta') # 清理旧的运行文件 + + test_train() + print('\n'.join(sorted([x for x in os.listdir('.') if x.startswith('b_lenet')]))) + + resume_train() + print('\n'.join(sorted([x for x in os.listdir('.') if x.startswith('b_lenet')]))) + \ No newline at end of file diff --git a/experiment_3/3-Computer_Vision.md b/experiment_3/3-Computer_Vision.md new file mode 100644 index 0000000..0030994 --- /dev/null +++ b/experiment_3/3-Computer_Vision.md @@ -0,0 +1,344 @@ +

计算机视觉应用

+ +[TOC] + +## 实验介绍 + +本实验主要介绍使用MindSpore在CIFAR10数据集上训练ResNet50。本实验建议使用MindSpore model_zoo中提供的ResNet50。 + +## 实验目的 + +- 了解如何使用MindSpore加载常用的CIFAR-10图片分类数据集。 +- 了解MindSpore的model_zoo模块,以及如何使用model_zoo中的模型。 +- 了解ResNet50这类大模型的基本结构和编程方法。 + +## 预备知识 + +- 熟练使用Python,了解Shell及Linux操作系统基本知识。 +- 具备一定的深度学习理论知识,如卷积神经网络、损失函数、优化器,训练策略、Checkpoint等。 +- 了解华为云的基本使用方法,包括[OBS(对象存储)](https://www.huaweicloud.com/product/obs.html)、[ModelArts(AI开发平台)](https://www.huaweicloud.com/product/modelarts.html)、[训练作业](https://support.huaweicloud.com/engineers-modelarts/modelarts_23_0046.html)等功能。华为云官网:https://www.huaweicloud.com +- 了解并熟悉MindSpore AI计算框架,MindSpore官网:https://www.mindspore.cn/ + +## 实验环境 + +- MindSpore 0.2.0(MindSpore版本会定期更新,本指导也会定期刷新,与版本配套); +- 华为云ModelArts:ModelArts是华为云提供的面向开发者的一站式AI开发平台,集成了昇腾AI处理器资源池,用户可以在该平台下体验MindSpore。ModelArts官网:https://www.huaweicloud.com/product/modelarts.html + +## 实验准备 + +### 创建OBS桶 + +本实验需要使用华为云OBS存储脚本和数据集,可以参考[快速通过OBS控制台上传下载文件](https://support.huaweicloud.com/qs-obs/obs_qs_0001.html)了解使用OBS创建桶、上传文件、下载文件的使用方法。 + +> **提示:**华为云新用户使用OBS时通常需要创建和配置“访问密钥”,可以在使用OBS时根据提示完成创建和配置。也可以参考[获取访问密钥并完成ModelArts全局配置](https://support.huaweicloud.com/prepare-modelarts/modelarts_08_0002.html)获取并配置访问密钥。 + +创建OBS桶的参考配置如下: + +- 区域:华北-北京四 +- 数据冗余存储策略:单AZ存储 +- 桶名称:如ms-course +- 存储类别:标准存储 +- 桶策略:公共读 +- 归档数据直读:关闭 +- 企业项目、标签等配置:免 + +### 数据集准备 + +CIFAR-10是一个图片分类数据集,包含60000张32x32的彩色物体图片,训练集50000张,测试集10000张,共10类,每类6000张。CIFAR-10数据集的官网:[The CIFAR-10 dataset](http://www.cs.toronto.edu/~kriz/cifar.html)。 + +从CIFAR-10官网下载“CIFAR-10 binary version (suitable for C programs)”到本地并解压。 + +### 脚本准备 + +从[MindSpore tutorial仓库](https://gitee.com/mindspore/docs/tree/r0.2/tutorials/tutorial_code/sample_for_cloud/)里下载相关脚本。 + +### 上传文件 + +将脚本和数据集上传到OBS桶中,组织为如下形式: + +``` +experiment_3 +├── 脚本等文件 +└── cifar10 + ├── batches.meta.txt + ├── test + │   └── test_batch.bin + └── train + ├── data_batch_1.bin + ├── data_batch_2.bin + ├── data_batch_3.bin + ├── data_batch_4.bin + └── data_batch_5.bin +``` + +## 实验步骤 
+ +参考MindSpore官网[计算机视觉应用](https://www.mindspore.cn/tutorial/zh-CN/0.1.0-alpha/advanced_use/computer_vision_application.html)教程,使用MindSpore在CIFAR10数据集上训练ResNet50,并进行验证。建议: + +- 使用单卡训练即可; +- 理解并熟悉教程中涉及的源码; +- 使用MindSpore model_zoo中提供的ResNet50。 + +### 代码梳理 + +- resnet50_train.py:主脚本,包含性能测试`PerformanceCallback`、动态学习率`get_lr`、执行函数`resnet50_train`等函数; +- dataset.py:数据处理脚本。 + +`PerformanceCallback`继承MindSpore Callback类,并统计每个训练step的时延: + +```python +class PerformanceCallback(Callback): + """ + Training performance callback. + + Args: + batch_size (int): Batch number for one step. + """ + def __init__(self, batch_size): + super(PerformanceCallback, self).__init__() + self.batch_size = batch_size + self.last_step = 0 + self.epoch_begin_time = 0 + + def step_begin(self, run_context): + self.epoch_begin_time = time.time() + + def step_end(self, run_context): + params = run_context.original_args() + cost_time = time.time() - self.epoch_begin_time + train_steps = params.cur_step_num -self.last_step + print(f'epoch {params.cur_epoch_num} cost time = {cost_time}, train step num: {train_steps}, ' + f'one step time: {1000*cost_time/train_steps} ms, ' + f'train samples per second of cluster: {device_num*train_steps*self.batch_size/cost_time:.1f}\n') + self.last_step = run_context.original_args().cur_step_num +``` + +`get_lr`生成学习率数组,其中每个元素对应每个step的学习率,这里学习率下降采用二次曲线的形式: + +```python +def get_lr(global_step, + total_epochs, + steps_per_epoch, + lr_init=0.01, + lr_max=0.1, + warmup_epochs=5): + """ + Generate learning rate array. + + Args: + global_step (int): Initial step of training. + total_epochs (int): Total epoch of training. + steps_per_epoch (float): Steps of one epoch. + lr_init (float): Initial learning rate. Default: 0.01. + lr_max (float): Maximum learning rate. Default: 0.1. + warmup_epochs (int): The number of warming up epochs. Default: 5. + + Returns: + np.array, learning rate array. 
+ """ + lr_each_step = [] + total_steps = steps_per_epoch * total_epochs + warmup_steps = steps_per_epoch * warmup_epochs + if warmup_steps != 0: + inc_each_step = (float(lr_max) - float(lr_init)) / float(warmup_steps) + else: + inc_each_step = 0 + for i in range(int(total_steps)): + if i < warmup_steps: + lr = float(lr_init) + inc_each_step * float(i) + else: + base = ( 1.0 - (float(i) - float(warmup_steps)) / (float(total_steps) - float(warmup_steps)) ) + lr = float(lr_max) * base * base + if lr < 0.0: + lr = 0.0 + lr_each_step.append(lr) + + current_step = global_step + lr_each_step = np.array(lr_each_step).astype(np.float32) + learning_rate = lr_each_step[current_step:] + + return learning_rate +``` + +MindSpore支持直接读取cifar10数据集: + +```python +if device_num == 1 or not do_train: + ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=do_shuffle) +else: + ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=do_shuffle, + num_shards=device_num, shard_id=device_id) +``` + +导入并使用model_zoo里的resnet50模型: + +```python +from mindspore.model_zoo.resnet import resnet50 +# create model +net = resnet50(class_num = class_num) +``` + +使用数据增强,如随机裁剪、随机水平反转: + +```python +# define map operations +random_crop_op = C.RandomCrop((32, 32), (4, 4, 4, 4)) +random_horizontal_flip_op = C.RandomHorizontalFlip(device_id / (device_id + 1)) +``` + +`model_zoo.resnet`中resnet50定义如下: + +```python +def resnet50(class_num=10): + return ResNet(ResidualBlock, + [3, 4, 6, 3], + [64, 256, 512, 1024], + [256, 512, 1024, 2048], + [1, 2, 2, 2], + class_num) +``` + +ResNet类定义如下: + +```python +class ResNet(nn.Cell): + """ + ResNet architecture. + + Args: + block (Cell): Block for network. + layer_nums (list): Numbers of block in different layers. + in_channels (list): Input channel in each layer. + out_channels (list): Output channel in each layer. + strides (list): Stride size in each layer. 
+ num_classes (int): The number of classes that the training images are belonging to. + Returns: + Tensor, output tensor. + + Examples: + >>> ResNet(ResidualBlock, + >>> [3, 4, 6, 3], + >>> [64, 256, 512, 1024], + >>> [256, 512, 1024, 2048], + >>> [1, 2, 2, 2], + >>> 10) + """ +``` + +ResNet的不同版本均由5个阶段(stage)组成,其中ResNet50结构为Convx1 -> ResidualBlockx3 -> ResidualBlockx4 -> ResidualBlockx6 -> ResidualBlockx3 -> Pooling+FC。 + +`ResidualBlock`为残差模块,相比传统卷积多了一个short-cut支路,用于将浅层的信息直接传递到深层,使得网络可以很深,而不会出现训练时梯度消失/爆炸的问题: + +```python +class ResidualBlock(nn.Cell): + expansion = 4 + + def __init__(self, + in_channel, + out_channel, + stride=1): + super(ResidualBlock, self).__init__() + + channel = out_channel // self.expansion + self.conv1 = _conv1x1(in_channel, channel, stride=1) + self.bn1 = _bn(channel) + + self.conv2 = _conv3x3(channel, channel, stride=stride) + self.bn2 = _bn(channel) + + self.conv3 = _conv1x1(channel, out_channel, stride=1) + self.bn3 = _bn_last(out_channel) + + self.relu = nn.ReLU() + + # 如果stride不为1或输入输出通道数不同,short-cut支路需要通过1x1卷积+BN进行下采样,以匹配残差支路的输出 + self.down_sample = False + if stride != 1 or in_channel != out_channel: + self.down_sample = True + self.down_sample_layer = None + if self.down_sample: + self.down_sample_layer = nn.SequentialCell([_conv1x1(in_channel, out_channel, stride), + _bn(out_channel)]) + self.add = P.TensorAdd() + + def construct(self, x): + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.down_sample: + identity = self.down_sample_layer(identity) + + # out为残差支路,identity为short-cut支路 + out = self.add(out, identity) + out = self.relu(out) + + return out +``` + +创建训练作业时,运行参数会通过脚本传参的方式输入给脚本代码,脚本必须解析传参才能在代码中使用相应参数。如data_url和train_url,分别对应数据存储路径(OBS路径)和训练输出路径(OBS路径)。脚本对传参进行解析后赋值到`args`变量里,在后续代码里可以使用。 + +```python +import argparse +parser = argparse.ArgumentParser() +parser.add_argument('--data_url', required=True, default=None, 
help='Location of data.') +parser.add_argument('--train_url', required=True, default=None, help='Location of training outputs.') +parser.add_argument('--num_epochs', type=int, default=1, help='Number of training epochs.') +args, unknown = parser.parse_known_args() +``` + +MindSpore暂时没有提供直接访问OBS数据的接口,需要通过MoXing提供的API与OBS交互。将OBS中存储的数据拷贝至执行容器: + +```python +import moxing as mox +mox.file.copy_parallel(src_url=args.data_url, dst_url='cifar10/') +``` + +如需将训练输出(如模型Checkpoint)从执行容器拷贝至OBS,请参考: + +```python +import moxing as mox +mox.file.copy_parallel(src_url='output', dst_url='s3://OBS/PATH') +``` + +### 创建训练作业 + +可以参考[使用常用框架训练模型](https://support.huaweicloud.com/engineers-modelarts/modelarts_23_0238.html)来创建并启动训练作业。 + +创建训练作业的参考配置: + +- 算法来源:常用框架->Ascend-Powered-Engine->MindSpore +- 代码目录:选择上述新建的OBS桶中的experiment_3目录 +- 启动文件:选择上述新建的OBS桶中的experiment_3目录下的`resnet50_train.py` +- 数据来源:数据存储位置->选择上述新建的OBS桶中的experiment_3文件夹下的cifar10目录 +- 训练输出位置:选择上述新建的OBS桶中的experiment_3目录并在其中创建output目录 +- 作业日志路径:同训练输出位置 +- 规格:Ascend:1*Ascend 910 +- 其他均为默认 + +启动并查看训练过程: + +1. 点击提交以开始训练; +2. 在训练作业列表里可以看到刚创建的训练作业,在训练作业页面可以看到版本管理; +3. 点击运行中的训练作业,在展开的窗口中可以查看作业配置信息,以及训练过程中的日志,日志会不断刷新,等训练作业完成后也可以下载日志到本地进行查看; +4. 在训练日志中可以看到`epoch 90 cost time = 27.963477849960327, train step num: 1562, one step time: 17.90235457743939 ms, train samples per second of cluster: 1787.5`等字段,即训练过程的性能数据; +5. 在训练日志中可以看到`epoch: 90 step: 1562, loss is 0.00250402`等字段,即训练过程的loss数据; +6. 在训练日志里可以看到`Evaluation result: {'acc': 0.9182692307692307}.`字段,即训练完成后的验证精度。 + +## 实验结论 + +本实验主要介绍使用MindSpore在CIFAR10数据集上训练ResNet50,了解了以下知识点: + +- 性能测试 +- 动态学习率 +- model_zoo:resnet50 +- cifar10数据集、数据增强 diff --git a/experiment_3/dataset.py b/experiment_3/dataset.py new file mode 100644 index 0000000..8896e06 --- /dev/null +++ b/experiment_3/dataset.py @@ -0,0 +1,86 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Create train or eval dataset.""" +import os +import mindspore.common.dtype as mstype +import mindspore.dataset.engine as de +import mindspore.dataset.transforms.vision.c_transforms as C +import mindspore.dataset.transforms.c_transforms as C2 + + +device_id = int(os.getenv('DEVICE_ID')) +device_num = int(os.getenv('RANK_SIZE')) + + +def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32): + """ + Create a train or eval dataset. + + Args: + dataset_path (str): The path of dataset. + do_train (bool): Whether dataset is used for train or eval. + repeat_num (int): The repeat times of dataset. Default: 1. + batch_size (int): The batch size of dataset. Default: 32. + + Returns: + Dataset. 
+ """ + if do_train: + dataset_path = os.path.join(dataset_path, 'train') + do_shuffle = True + else: + dataset_path = os.path.join(dataset_path, 'eval') + do_shuffle = False + + if device_num == 1 or not do_train: + ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=do_shuffle) + else: + ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=do_shuffle, + num_shards=device_num, shard_id=device_id) + + resize_height = 224 + resize_width = 224 + buffer_size = 100 + rescale = 1.0 / 255.0 + shift = 0.0 + + # define map operations + random_crop_op = C.RandomCrop((32, 32), (4, 4, 4, 4)) + random_horizontal_flip_op = C.RandomHorizontalFlip(device_id / (device_id + 1)) + + resize_op = C.Resize((resize_height, resize_width)) + rescale_op = C.Rescale(rescale, shift) + normalize_op = C.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]) + + change_swap_op = C.HWC2CHW() + + trans = [] + if do_train: + trans += [random_crop_op, random_horizontal_flip_op] + + trans += [resize_op, rescale_op, normalize_op, change_swap_op] + + type_cast_op = C2.TypeCast(mstype.int32) + + ds = ds.map(input_columns="label", num_parallel_workers=8, operations=type_cast_op) + ds = ds.map(input_columns="image", num_parallel_workers=8, operations=trans) + + # apply batch operations + ds = ds.batch(batch_size, drop_remainder=True) + + # apply dataset repeat operation + ds = ds.repeat(repeat_num) + + return ds diff --git a/experiment_3/resnet50_train.py b/experiment_3/resnet50_train.py new file mode 100644 index 0000000..0b422a2 --- /dev/null +++ b/experiment_3/resnet50_train.py @@ -0,0 +1,172 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""ResNet50 model train with MindSpore""" +import os +import argparse +import random +import time +import numpy as np +import moxing as mox + +from mindspore import context +from mindspore import Tensor +from mindspore.nn.optim.momentum import Momentum +from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits +from mindspore.train.model import Model, ParallelMode +from mindspore.train.callback import Callback, LossMonitor +from mindspore.train.loss_scale_manager import FixedLossScaleManager +import mindspore.dataset.engine as de + +from dataset import create_dataset, device_id, device_num +from mindspore.model_zoo.resnet import resnet50 + +random.seed(1) +np.random.seed(1) +de.config.set_seed(1) + + +class PerformanceCallback(Callback): + """ + Training performance callback. + + Args: + batch_size (int): Batch number for one step. 
+ """ + def __init__(self, batch_size): + super(PerformanceCallback, self).__init__() + self.batch_size = batch_size + self.last_step = 0 + self.epoch_begin_time = 0 + + def step_begin(self, run_context): + self.epoch_begin_time = time.time() + + def step_end(self, run_context): + params = run_context.original_args() + cost_time = time.time() - self.epoch_begin_time + train_steps = params.cur_step_num -self.last_step + print(f'epoch {params.cur_epoch_num} cost time = {cost_time}, train step num: {train_steps}, ' + f'one step time: {1000*cost_time/train_steps} ms, ' + f'train samples per second of cluster: {device_num*train_steps*self.batch_size/cost_time:.1f}\n') + self.last_step = run_context.original_args().cur_step_num + + +def get_lr(global_step, + total_epochs, + steps_per_epoch, + lr_init=0.01, + lr_max=0.1, + warmup_epochs=5): + """ + Generate learning rate array. + + Args: + global_step (int): Initial step of training. + total_epochs (int): Total epoch of training. + steps_per_epoch (float): Steps of one epoch. + lr_init (float): Initial learning rate. Default: 0.01. + lr_max (float): Maximum learning rate. Default: 0.1. + warmup_epochs (int): The number of warming up epochs. Default: 5. + + Returns: + np.array, learning rate array. 
+ """ + lr_each_step = [] + total_steps = steps_per_epoch * total_epochs + warmup_steps = steps_per_epoch * warmup_epochs + if warmup_steps != 0: + inc_each_step = (float(lr_max) - float(lr_init)) / float(warmup_steps) + else: + inc_each_step = 0 + for i in range(int(total_steps)): + if i < warmup_steps: + lr = float(lr_init) + inc_each_step * float(i) + else: + base = ( 1.0 - (float(i) - float(warmup_steps)) / (float(total_steps) - float(warmup_steps)) ) + lr = float(lr_max) * base * base + if lr < 0.0: + lr = 0.0 + lr_each_step.append(lr) + + current_step = global_step + lr_each_step = np.array(lr_each_step).astype(np.float32) + learning_rate = lr_each_step[current_step:] + + return learning_rate + + +def resnet50_train(args_opt): + epoch_size = args_opt.epoch_size + batch_size = 32 + class_num = 10 + loss_scale_num = 1024 + local_data_path = '/cache/data' + + # set graph mode and parallel mode + context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False) + context.set_context(enable_task_sink=True, device_id=device_id) + context.set_context(enable_loop_sink=True) + context.set_context(enable_mem_reuse=True) + if device_num > 1: + context.set_auto_parallel_context(device_num=device_num, + parallel_mode=ParallelMode.DATA_PARALLEL, + mirror_mean=True) + local_data_path = os.path.join(local_data_path, str(device_id)) + + # data download + print('Download data.') + mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_path) + + # create dataset + print('Create train and evaluate dataset.') + train_dataset = create_dataset(dataset_path=local_data_path, do_train=True, + repeat_num=epoch_size, batch_size=batch_size) + eval_dataset = create_dataset(dataset_path=local_data_path, do_train=False, + repeat_num=1, batch_size=batch_size) + train_step_size = train_dataset.get_dataset_size() + print('Create dataset success.') + + # create model + net = resnet50(class_num = class_num) + loss = SoftmaxCrossEntropyWithLogits(sparse=True) + 
lr = Tensor(get_lr(global_step=0, total_epochs=epoch_size, steps_per_epoch=train_step_size)) + opt = Momentum(net.trainable_params(), lr, momentum=0.9, weight_decay=1e-4, loss_scale=loss_scale_num) + loss_scale = FixedLossScaleManager(loss_scale_num, False) + + model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}) + + # define performance callback to show ips and loss callback to show loss for every epoch + performance_cb = PerformanceCallback(batch_size) + loss_cb = LossMonitor() + cb = [performance_cb, loss_cb] + + print(f'Start run training, total epoch: {epoch_size}.') + model.train(epoch_size, train_dataset, callbacks=cb) + if device_num == 1 or device_id == 0: + print(f'Start run evaluation.') + output = model.eval(eval_dataset) + print(f'Evaluation result: {output}.') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='ResNet50 train.') + parser.add_argument('--data_url', required=True, default=None, help='Location of data.') + parser.add_argument('--train_url', required=True, default=None, help='Location of training outputs.') + parser.add_argument('--epoch_size', type=int, default=90, help='Train epoch size.') + + args_opt, unknown = parser.parse_known_args() + + resnet50_train(args_opt) + print('ResNet50 training success!') diff --git a/experiment_4/4-Natural_Language_Processing.md b/experiment_4/4-Natural_Language_Processing.md new file mode 100644 index 0000000..5880d42 --- /dev/null +++ b/experiment_4/4-Natural_Language_Processing.md @@ -0,0 +1,374 @@ +

自然语言处理应用

+ +[TOC] + +## 实验介绍 + +本实验主要介绍使用MindSpore开发和训练[BERT](https://arxiv.org/pdf/1810.04805.pdf)模型。建议先了解MindSpore官网上model_zoo上的BERT模型。 + +## 实验目的 + +- 了解如何使用MindSpore加载常用的NLP数据集。 +- 了解MindSpore的model_zoo模块,以及如何使用model_zoo中的模型。 +- 了解BERT模型的基本结构和编程方法。 + +## 预备知识 + +- 熟练使用Python,了解Shell及Linux操作系统基本知识。 +- 具备一定的深度学习理论知识,如Embedding、Encoder、Decoder、损失函数、优化器、训练策略、Checkpoint等。 +- 了解华为云的基本使用方法,包括[OBS(对象存储)](https://www.huaweicloud.com/product/obs.html)、[ModelArts(AI开发平台)](https://www.huaweicloud.com/product/modelarts.html)、[训练作业](https://support.huaweicloud.com/engineers-modelarts/modelarts_23_0046.html)等功能。华为云官网:https://www.huaweicloud.com +- 了解并熟悉MindSpore AI计算框架,MindSpore官网:https://www.mindspore.cn/ + +## 实验环境 + +- MindSpore 0.2.0(MindSpore版本会定期更新,本指导也会定期刷新,与版本配套); +- 华为云ModelArts:ModelArts是华为云提供的面向开发者的一站式AI开发平台,集成了昇腾AI处理器资源池,用户可以在该平台下体验MindSpore。ModelArts官网:https://www.huaweicloud.com/product/modelarts.html + +## 实验准备 + +### 创建OBS桶 + +本实验需要使用华为云OBS存储脚本和数据集,可以参考[快速通过OBS控制台上传下载文件](https://support.huaweicloud.com/qs-obs/obs_qs_0001.html)了解使用OBS创建桶、上传文件、下载文件的使用方法。 + +> **提示:**华为云新用户使用OBS时通常需要创建和配置“访问密钥”,可以在使用OBS时根据提示完成创建和配置。也可以参考[获取访问密钥并完成ModelArts全局配置](https://support.huaweicloud.com/prepare-modelarts/modelarts_08_0002.html)获取并配置访问密钥。 + +创建OBS桶的参考配置如下: + +- 区域:华北-北京四 +- 数据冗余存储策略:单AZ存储 +- 桶名称:如ms-course +- 存储类别:标准存储 +- 桶策略:公共读 +- 归档数据直读:关闭 +- 企业项目、标签等配置:免 + +### 数据集准备 + +**预训练(pretrain)数据集**:下载[zhwiki数据集](https://dumps.wikimedia.org/zhwiki),使用[WikiExtractor](https://github.com/attardi/wikiextractor)进行预处理,然后使用[google-research/bert:create_pretraining_data.py](https://github.com/google-research/bert/blob/master/create_pretraining_data.py)将数据转为TFRecord格式; + +zhwiki为中文维基百科数据集,需要将其处理为具有上下文关系的句子对,然后基于词典vocab.txt对每个句子对进行token化,然后存储为特定数据格式(如Json、TFRecord、MindRecord)。 + +**微调(finetune)数据集**:使用[CLUEbenchmark/CLUEPretrainedModels中的脚本](https://github.com/CLUEbenchmark/CLUEPretrainedModels/blob/master/baselines/models/bert/run_classifier_tnews.sh)下载、处理TNEWS数据集,并将数据转为TFRecord格式。 + 
+TNEWS为今日头条中文新闻(短文本)分类(Short Text Classification for News)数据集。该数据集来自今日头条的新闻版块,共提取了15个类别的新闻,包括旅游,教育,金融,军事等。数据量:训练集(53,360),验证集(10,000),测试集(10,000)。例子: + +{"label": "102", "label_des": "news_entertainment", "sentence": "江疏影甜甜圈自拍,迷之角度竟这么好看,美吸引一切事物"} + +每一条数据有三个属性,从前往后分别是 分类ID,分类名称,新闻字符串(仅含标题)。 + +本实验不进行数据预处理,请从网盘下载zhwiki_part和tnews数据集: + +链接: https://pan.baidu.com/s/1F2S9Wr-ND0LMfATjv7WEug 提取码: gent + +### 脚本准备 + +从[课程gitee仓库](https://gitee.com/mindspore/course)上下载本实验相关脚本。其中`tokenization.py`来源于[google-research/bert](https://github.com/google-research/bert/blob/master/tokenization.py) + +### 上传文件 + +将脚本和数据集上传到OBS桶中,组织为如下形式: + +``` +experiment_4 +├── 脚本等文件 +├── tnews +│   ├── bert_base.ckpt +│   ├── dev.tf_record +│   ├── dev_schema.json +│   ├── label2id.json +│   ├── train.tf_record +│   ├── train_schema.json +│   └── vocab.txt +└── zhwiki_part + ├── schema.json + └── part.tfrecord +``` + +## 实验步骤 + +参考MindSpore开源仓库[BERT example](https://gitee.com/mindspore/mindspore/tree/r0.2/example/Bert_NEZHA_cnwiki)示例,并进行实验。 + +BERT(Bidirectional Encoder Representations from Transformers),即基于Transformer的双向编码表征。其中: + +- Transformer是一种注意力(Attention)机制,用来学习文本中单词上下文之间的关系; +- 双向是指通过Masked Language Model(MLM)方法,随机地掩盖掉句子中的某些单词,然后利用前后未掩盖的信息来预测掩盖的单词; + +更多BERT的介绍可以参考[Link](https://www.jianshu.com/p/d110d0c13063) + +### 预训练BERT模型 + +[BERT](https://github.com/google-research/bert)模型包含由不同隐含层数(number hidden layers)和隐含层单元数(hidden size)构成的不同版本。通常情况下使用Bert需要预训练(pretrain)和微调(fine-tune)两个阶段。预训练BERT模型通常需要在大数据集上多卡并行训练多天。本实验先以部分zhwiki数据集为例展示预训练的过程。 + +BERT预训练阶段包含两个任务(两个输出): + +- Mask语言模型(Mask LM):预测被掩盖掉(mask)的单词; +- NextSentence预测(NSP):判断句子对是否具有上下文关系,即句子B是否是句子A的下一句。 + +### 代码梳理 + +model_zoo:Bert_NEZHA中包含两个模块: + +- `bert_for_pre_training.py`:包含`GetMaskedLMOutput`, `GetNextSentenceOutput`, `BertPreTraining`, `BertPretrainingLoss`, `BertNetworkWithLoss`, `BertTrainOneStepCell`, `BertTrainOneStepWithLossScaleCell`; +- `bert_model.py`:包含`BertModel`及其依赖的各个子模块。 + +`GetMaskedLMOutput`接在BERT基础模型的后面,用于获取Mask LM的输出, + 
+`GetNextSentenceOutput`在BERT基础模型的后面接了一个全连接层和Softmax层,用于获取NSP的输出。 + +```python +class GetNextSentenceOutput(nn.Cell): + def construct(self, input_tensor): + logits = self.dense(input_tensor) + logits = self.cast(logits, self.dtype) + log_prob = self.log_softmax(logits) + return log_prob +``` + +`BertPreTraining`将Mask LM模型和NSP模型封装成一个模型定义,`BertPretrainingLoss`将Mask LM Loss和NSP Loss加和封装为一个Loss定义。`BertNetworkWithLoss`根据模型输出计算Loss值。 + +```python +class BertNetworkWithLoss(nn.Cell): + """ + Provide bert pre-training loss through network. + + Args: + config (BertConfig): The config of BertModel. + is_training (bool): Specifies whether to use the training mode. + use_one_hot_embeddings (bool): Specifies whether to use one-hot for embeddings. Default: False. + + Returns: + Tensor, the loss of the network. + """ + def __init__(self, config, is_training, use_one_hot_embeddings=False): + super(BertNetworkWithLoss, self).__init__() + self.bert = BertPreTraining(config, is_training, use_one_hot_embeddings) + self.loss = BertPretrainingLoss(config) + self.cast = P.Cast() + + def construct(self, + input_ids, + input_mask, + token_type_id, + next_sentence_labels, + masked_lm_positions, + masked_lm_ids, + masked_lm_weights): + prediction_scores, seq_relationship_score = \ + self.bert(input_ids, input_mask, token_type_id, masked_lm_positions) + total_loss = self.loss(prediction_scores, seq_relationship_score, + masked_lm_ids, masked_lm_weights, next_sentence_labels) + return self.cast(total_loss, mstype.float32) +``` + +`BertTrainOneStepCell`在`BertNetworkWithLoss`上加上了反向传播和梯度更新(优化器),接收数据输入,更新模型权重。`BertTrainOneStepWithLossScaleCell`在此基础上引入了损失缩放(Loss Scaling)。损失缩放是为了应对反向传播过程中梯度数值较小,计算时(如采用FP16)会被当做0处理,所以先对Loss做一个放大,然后再对梯度进行缩小。 + +`bert_model.py`中`BertModel`接收数据输入,经过`EmbeddingLookup`, `EmbeddingPostprocessor`, `BertTransformer`和`Dense`计算后得到输出。 + +![BERT Model](https://www.lyrn.ai/wp-content/uploads/2018/11/transformer.png) + +[1] 图片来源于https://www.lyrn.ai + +```python +class 
BertModel(nn.Cell): + def construct(self, input_ids, token_type_ids, input_mask): + # embedding + if not self.token_type_ids_from_dataset: + token_type_ids = self.token_type_ids + word_embeddings, embedding_tables = self.bert_embedding_lookup(input_ids) + embedding_output = self.bert_embedding_postprocessor(token_type_ids, + word_embeddings) + + # attention mask [batch_size, seq_length, seq_length] + attention_mask = self._create_attention_mask_from_input_mask(input_mask) + + # bert encoder + encoder_output = self.bert_encoder(self.cast_compute_type(embedding_output), + attention_mask) + + sequence_output = self.cast(encoder_output[self.last_idx], self.dtype) + + # pooler + sequence_slice = self.slice(sequence_output, + (0, 0, 0), + (self.batch_size, 1, self.hidden_size), + (1, 1, 1)) + first_token = self.squeeze_1(sequence_slice) + pooled_output = self.dense(first_token) + pooled_output = self.cast(pooled_output, self.dtype) + + return sequence_output, pooled_output, embedding_tables +``` + +`EmbeddingLookup`和`EmbeddingPostprocessor`用于将输入转换成Embedding张量,Embedding如下图所示: + +![Embedding](https://www.lyrn.ai/wp-content/uploads/2018/11/NSP.png) + +[2] 图片来源于https://www.lyrn.ai 和https://arxiv.org/pdf/1810.04805.pdf + +`BertTransformer`采用了下图中[Transformer](https://arxiv.org/pdf/1706.03762.pdf)中的encoder部分(左侧半边),包含`BertAttention->BertSelfAttention->BertEncoderCell`。 + +![Transformer](https://pic2.zhimg.com/80/v2-0e85f4d440e621803d11408b39834dd1_720w.jpg) + +[3] 图片来源于https://zhuanlan.zhihu.com/p/34781297 和https://arxiv.org/pdf/1706.03762.pdf + +`BertAttention`为Multi-Head Attention: + +![Multi-Head Attention](https://pic3.zhimg.com/80/v2-58d60594bc3e9cbe47faec82ef29fd76_720w.jpg) +[4] 图片来源于https://zhuanlan.zhihu.com/p/34781297 和https://arxiv.org/pdf/1706.03762.pdf + +创建训练作业时,运行参数会通过脚本传参的方式输入给脚本代码,脚本必须解析传参才能在代码中使用相应参数。如data_url和train_url,分别对应数据存储路径(OBS路径)和训练输出路径(OBS路径)。脚本对传参进行解析后赋值到`args`变量里,在后续代码里可以使用。 + +```python +import argparse +parser = argparse.ArgumentParser() 
+parser.add_argument('--data_url', required=True, default=None, help='Location of data.') +parser.add_argument('--train_url', required=True, default=None, help='Location of training outputs.') +parser.add_argument('--num_epochs', type=int, default=1, help='Number of training epochs.') +args, unknown = parser.parse_known_args() +``` + +MindSpore暂时没有提供直接访问OBS数据的接口,需要通过MoXing提供的API与OBS交互。将OBS中存储的数据拷贝至执行容器: + +```python +import moxing as mox +mox.file.copy_parallel(src_url=args.data_url, dst_url='zhwiki_part/') +``` + +将训练模型Checkpoint从执行容器拷贝至OBS: + +```python +import moxing as mox +mox.file.copy_parallel(src_url='bert_classfication-3_3335.ckpt', + dst_url=os.path.join(args.data_url, 'bert_classfication-3_3335.ckpt')) +``` + +#### 创建训练作业 + +可以参考[使用常用框架训练模型](https://support.huaweicloud.com/engineers-modelarts/modelarts_23_0238.html)来创建并启动训练作业。 + +创建训练作业的参考配置: + +- 算法来源:常用框架->Ascend-Powered-Engine->MindSpore +- 代码目录:选择上述新建的OBS桶中的experiment_4目录 +- 启动文件:选择上述新建的OBS桶中的experiment_4目录下的`pretrain.py` +- 数据来源:数据存储位置->选择上述新建的OBS桶中的experiment_4文件夹下的zhwiki_part目录 +- 训练输出位置:选择上述新建的OBS桶中的experiment_4目录并在其中创建pretrain_output目录 +- 作业日志路径:同训练输出位置 +- 规格:Ascend:1*Ascend 910 +- 其他均为默认 + +启动并查看训练过程: + +1. 点击提交以开始训练; +2. 在训练作业列表里可以看到刚创建的训练作业,在训练作业页面可以看到版本管理; +3. 点击运行中的训练作业,在展开的窗口中可以查看作业配置信息,以及训练过程中的日志,日志会不断刷新,等训练作业完成后也可以下载日志到本地进行查看; +4. 
在训练日志中可以看到`epoch: 10 step: 10, loss is 10.741777`等字段,即预训练过程的loss数据。 + +### 微调BERT + +通常情况下,需要基于预训练的BERT模型在各类细分任务上做微调(finetune),提高BERT在具体任务上的效果。本实验在CLUEbenchmark/CLUE提供的TNEWS数据集上对预训练的BERT做微调,即学习一个短文本分类任务。 + +预训练和微调两种情况下BERT基础模型是相同的,只是最后会在基础模型上加上不同的任务层,用于解决文本分类(新闻分类、情感分类)、序列标注(命名实体识别、问答)等任务。 + +微调BERT依赖如下几个模块: + +- `finetune.py`:包含Loss打印、数据处理、优化器、模型保存等; +- `finetune_config.py`:模型和训练配置; +- `utils.py`模块中定义了finetune需要的模型,包含`BertFinetuneCell`, `BertCLSModel`, `BertNERModel`, `BertCLS`和`BertNER`。 + +`BertFinetuneCell`等同于预训练时的`BertTrainOneStepCell`/`BertTrainOneStepWithLossScaleCell`,接收数据输入,更新模型权重。 + +`BertCLSModel`在BERT基础模型上接了分类任务头: + +```python +class BertCLSModel(nn.Cell): + """ + This class is responsible for classification task evaluation, i.e. XNLI(num_labels=3), + LCQMC(num_labels=2), Chnsenti(num_labels=2). The returned output represents the final + logits as the results of log_softmax is proportional to that of softmax. +``` + +`BertNERModel`在BERT基础模型上接了命名实体识别(NER)任务头: + +```python +class BertNERModel(nn.Cell): + """ + This class is responsible for sequence labeling task evaluation, i.e. NER(num_labels=11). + The returned output represents the final logits as the results of log_softmax is proportional to that of softmax. + """ +``` + +`BertCLS`和`BertNER`在任务模型上接了损失函数,作为`BertFinetuneCell`的输入。 + +#### 创建训练作业 + +可以参考[使用常用框架训练模型](https://support.huaweicloud.com/engineers-modelarts/modelarts_23_0238.html)来创建并启动训练作业。 + +### 代码梳理 + +创建训练作业的参考配置: + +- 算法来源:常用框架->Ascend-Powered-Engine->MindSpore +- 代码目录:选择上述新建的OBS桶中的experiment_4目录 +- 启动文件:选择上述新建的OBS桶中的experiment_4目录下的`finetune.py` +- 数据来源:数据存储位置->选择上述新建的OBS桶中的experiment_4文件夹下的tnews目录 +- 训练输出位置:选择上述新建的OBS桶中的experiment_4目录并在其中创建finetune_output目录 +- 作业日志路径:同训练输出位置 +- 规格:Ascend:1*Ascend 910 +- 其他均为默认 + +启动并查看训练过程: + +1. 点击提交以开始训练,微调过程约18分钟; +2. 在训练作业列表里可以看到刚创建的训练作业,在训练作业页面可以看到版本管理; +3. 点击运行中的训练作业,在展开的窗口中可以查看作业配置信息,以及训练过程中的日志,日志会不断刷新,等训练作业完成后也可以下载日志到本地进行查看; +4. 
在训练日志中可以看到`epoch: 3, step: 10005, outputs are (1.4425085, False)`等字段,即微调过程的输出。 + +## 验证BERT + +在TNEWS验证集上对微调后的BERT模型做验证(evaluation)。 + +### 代码梳理 + +验证BERT依赖如下几个模块: + +- `evaluation.py`:包含Accuracy(分类任务)、F1值(NER任务)的计算,数据处理等; +- `evaluation_config.py`:模型和训练配置; +- `cluener_evaluation.py`:中文任务基准测评(Chinese Language Understanding Evaluation Benchmark)方法,未使用; +- `tokenization.py`:基于vocab.txt,将单词token化,未使用; +- `sample_process.py`:基于`tokenization.py`进行文本数据处理,未使用; +- `utils.py`:依赖微调时用的模型。 + +脚本传参、数据拷贝等代码参考预训练BERT中的解释。 + +#### 创建训练作业 + +可以参考[使用常用框架训练模型](https://support.huaweicloud.com/engineers-modelarts/modelarts_23_0238.html)来创建并启动训练作业。 + +创建训练作业的参考配置: + +- 算法来源:常用框架->Ascend-Powered-Engine->MindSpore +- 代码目录:选择上述新建的OBS桶中的experiment_4目录 +- 启动文件:选择上述新建的OBS桶中的experiment_4目录下的`evaluation.py` +- 数据来源:数据存储位置->选择上述新建的OBS桶中的experiment_4文件夹下的tnews目录 +- 训练输出位置:选择上述新建的OBS桶中的experiment_4目录并在其中创建eval_output目录 +- 作业日志路径:同训练输出位置 +- 规格:Ascend:1*Ascend 910 +- 其他均为默认 + +启动并查看训练过程: + +1. 点击提交以开始训练; +2. 在训练作业列表里可以看到刚创建的训练作业,在训练作业页面可以看到版本管理; +3. 点击运行中的训练作业,在展开的窗口中可以查看作业配置信息,以及训练过程中的日志,日志会不断刷新,等训练作业完成后也可以下载日志到本地进行查看; +4. 在训练日志中可以看到`acc_num 5437 , total_num 10000, accuracy 0.543700`字段,即微调完成后的验证精度。 + +## 实验结论 + +本实验主要介绍使用MindSpore在zhwiki数据集上预训练BERT,在TNEWS短文本分类数据集上进行微调,包括以下特性: + +- model_zoo:BERT +- BERT预训练 +- BERT微调 +- 不同的优化器 +- 文本数据集处理 diff --git a/experiment_4/CRF.py b/experiment_4/CRF.py new file mode 100644 index 0000000..02f117f --- /dev/null +++ b/experiment_4/CRF.py @@ -0,0 +1,177 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +''' +CRF script. +''' + +import numpy as np +import mindspore.nn as nn +from mindspore.ops import operations as P +from mindspore.common.tensor import Tensor +from mindspore.common.parameter import Parameter +import mindspore.common.dtype as mstype + +class CRF(nn.Cell): + ''' + Conditional Random Field + Args: + tag_to_index: The dict for tag to index mapping with extra "" and ""sign. + batch_size: Batch size, i.e., the length of the first dimension. + seq_length: Sequence length, i.e., the length of the second dimention. + is_training: Specifies whether to use training mode. + Returns: + Training mode: Tensor, total loss. + Evaluation mode: Tuple, the index for each step with the highest score; Tuple, the index for the last + step with the highest score. + ''' + def __init__(self, tag_to_index, batch_size=1, seq_length=128, is_training=True): + + super(CRF, self).__init__() + self.target_size = len(tag_to_index) + self.is_training = is_training + self.tag_to_index = tag_to_index + self.batch_size = batch_size + self.seq_length = seq_length + self.START_TAG = "" + self.STOP_TAG = "" + self.START_VALUE = Tensor(self.target_size-2, dtype=mstype.int32) + self.STOP_VALUE = Tensor(self.target_size-1, dtype=mstype.int32) + transitions = np.random.normal(size=(self.target_size, self.target_size)).astype(np.float32) + transitions[tag_to_index[self.START_TAG], :] = -10000 + transitions[:, tag_to_index[self.STOP_TAG]] = -10000 + self.transitions = Parameter(Tensor(transitions), name="transition_matrix") + self.cat = P.Concat(axis=-1) + self.argmax = P.ArgMaxWithValue(axis=-1) + self.log = P.Log() + self.exp = P.Exp() + self.sum = P.ReduceSum() + self.tile = P.Tile() + self.reduce_sum = P.ReduceSum(keep_dims=True) + self.reshape = P.Reshape() + self.expand = P.ExpandDims() + self.mean = 
P.ReduceMean() + init_alphas = np.ones(shape=(self.batch_size, self.target_size)) * -10000.0 + init_alphas[:, self.tag_to_index[self.START_TAG]] = 0. + self.init_alphas = Tensor(init_alphas, dtype=mstype.float32) + self.cast = P.Cast() + self.reduce_max = P.ReduceMax(keep_dims=True) + self.on_value = Tensor(1.0, dtype=mstype.float32) + self.off_value = Tensor(0.0, dtype=mstype.float32) + self.onehot = P.OneHot() + + def log_sum_exp(self, logits): + ''' + Compute the log_sum_exp score for normalization factor. + ''' + max_score = self.reduce_max(logits, -1) #16 5 5 + score = self.log(self.reduce_sum(self.exp(logits - max_score), -1)) + score = max_score + score + return score + + def _realpath_score(self, features, label): + ''' + Compute the emission and transition score for the real path. + ''' + label = label * 1 + concat_A = self.tile(self.reshape(self.START_VALUE, (1,)), (self.batch_size,)) + concat_A = self.reshape(concat_A, (self.batch_size, 1)) + labels = self.cat((concat_A, label)) + onehot_label = self.onehot(label, self.target_size, self.on_value, self.off_value) + emits = features * onehot_label + labels = self.onehot(labels, self.target_size, self.on_value, self.off_value) + label1 = labels[:, 1:, :] + label2 = labels[:, :self.seq_length, :] + label1 = self.expand(label1, 3) + label2 = self.expand(label2, 2) + label_trans = label1 * label2 + transitions = self.expand(self.expand(self.transitions, 0), 0) + trans = transitions * label_trans + score = self.sum(emits, (1, 2)) + self.sum(trans, (1, 2, 3)) + stop_value_index = labels[:, (self.seq_length-1):self.seq_length, :] + stop_value = self.transitions[(self.target_size-1):self.target_size, :] + stop_score = stop_value * self.reshape(stop_value_index, (self.batch_size, self.target_size)) + score = score + self.sum(stop_score, 1) + score = self.reshape(score, (self.batch_size, -1)) + return score + + def _normalization_factor(self, features): + ''' + Compute the total score for all the paths. 
+ ''' + forward_var = self.init_alphas + forward_var = self.expand(forward_var, 1) + for idx in range(self.seq_length): + feat = features[:, idx:(idx+1), :] + emit_score = self.reshape(feat, (self.batch_size, self.target_size, 1)) + next_tag_var = emit_score + self.transitions + forward_var + forward_var = self.log_sum_exp(next_tag_var) + forward_var = self.reshape(forward_var, (self.batch_size, 1, self.target_size)) + terminal_var = forward_var + self.reshape(self.transitions[(self.target_size-1):self.target_size, :], (1, -1)) + alpha = self.log_sum_exp(terminal_var) + alpha = self.reshape(alpha, (self.batch_size, -1)) + return alpha + + def _decoder(self, features): + ''' + Viterbi decode for evaluation. + ''' + backpointers = () + forward_var = self.init_alphas + for idx in range(self.seq_length): + feat = features[:, idx:(idx+1), :] + feat = self.reshape(feat, (self.batch_size, self.target_size)) + bptrs_t = () + + next_tag_var = self.expand(forward_var, 1) + self.transitions + best_tag_id, best_tag_value = self.argmax(next_tag_var) + bptrs_t += (best_tag_id,) + forward_var = best_tag_value + feat + + backpointers += (bptrs_t,) + terminal_var = forward_var + self.reshape(self.transitions[(self.target_size-1):self.target_size, :], (1, -1)) + best_tag_id, _ = self.argmax(terminal_var) + return backpointers, best_tag_id + + def construct(self, features, label): + if self.is_training: + forward_score = self._normalization_factor(features) + gold_score = self._realpath_score(features, label) + return_value = self.mean(forward_score - gold_score) + else: + path_list, tag = self._decoder(features) + return_value = path_list, tag + return return_value + +def postprocess(backpointers, best_tag_id): + ''' + Do postprocess + ''' + best_tag_id = best_tag_id.asnumpy() + batch_size = len(best_tag_id) + best_path = [] + for i in range(batch_size): + best_path.append([]) + best_local_id = best_tag_id[i] + best_path[-1].append(best_local_id) + for bptrs_t in 
reversed(backpointers): + bptrs_t = bptrs_t[0].asnumpy() + local_idx = bptrs_t[i] + best_local_id = local_idx[best_local_id] + best_path[-1].append(best_local_id) + # Pop off the start tag (we dont want to return that to the caller) + best_path[-1].pop() + best_path[-1].reverse() + return best_path diff --git a/experiment_4/cluener_evaluation.py b/experiment_4/cluener_evaluation.py new file mode 100644 index 0000000..67c2d28 --- /dev/null +++ b/experiment_4/cluener_evaluation.py @@ -0,0 +1,73 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +'''bert clue evaluation''' + +import json +import numpy as np +from evaluation_config import cfg +import mindspore.common.dtype as mstype +from mindspore.common.tensor import Tensor +from CRF import postprocess +import tokenization +from sample_process import label_generation, process_one_example_p + +vocab_file = "tnews/vocab.txt" + +def process(model, text, sequence_length): + """ + process text. 
+ """ + data = [text] + features = [] + res = [] + ids = [] + tokenizer_ = tokenization.FullTokenizer(vocab_file=vocab_file) + for i in data: + feature = process_one_example_p(tokenizer_, i, max_seq_len=sequence_length) + features.append(feature) + input_ids, input_mask, token_type_id = feature + input_ids = Tensor(np.array(input_ids), mstype.int32) + input_mask = Tensor(np.array(input_mask), mstype.int32) + token_type_id = Tensor(np.array(token_type_id), mstype.int32) + if cfg.use_crf: + backpointers, best_tag_id = model.predict(input_ids, input_mask, token_type_id, Tensor(1)) + best_path = postprocess(backpointers, best_tag_id) + logits = [] + for ele in best_path: + logits.extend(ele) + ids = logits + else: + logits = model.predict(input_ids, input_mask, token_type_id, Tensor(1)) + ids = logits.asnumpy() + ids = np.argmax(ids, axis=-1) + ids = list(ids) + res = label_generation(text, ids) + return res + +def submit(model, path, sequence_length): + """ + submit task + """ + data = [] + for line in open(path): + if not line.strip(): + continue + _ = json.loads(line.strip()) + res = process(model, _["text"], sequence_length) + print("_text", _["text"]) + print("res:", res) + data.append(json.dumps({"label": res}, ensure_ascii=False)) + open("ner_predict.json", "w").write("\n".join(data)) diff --git a/experiment_4/evaluation.py b/experiment_4/evaluation.py new file mode 100644 index 0000000..ab3eb3e --- /dev/null +++ b/experiment_4/evaluation.py @@ -0,0 +1,161 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +""" +Bert evaluation script. +""" + +import os +os.environ['P_NUM'] = '16' +import numpy as np +from evaluation_config import cfg, bert_net_cfg +from utils import BertNER, BertCLS +import mindspore.common.dtype as mstype +from mindspore import context +from mindspore.common.tensor import Tensor +import mindspore.dataset as de +import mindspore.dataset.transforms.c_transforms as C +from mindspore.train.model import Model +from mindspore.train.serialization import load_checkpoint, load_param_into_net +from CRF import postprocess +from cluener_evaluation import submit +from finetune_config import tag_to_index + +class Accuracy(): + ''' + calculate accuracy + ''' + def __init__(self): + self.acc_num = 0 + self.total_num = 0 + def update(self, logits, labels): + labels = labels.asnumpy() + labels = np.reshape(labels, -1) + logits = logits.asnumpy() + logit_id = np.argmax(logits, axis=-1) + self.acc_num += np.sum(labels == logit_id) + self.total_num += len(labels) + print("=========================accuracy is ", self.acc_num / self.total_num) + +class F1(): + ''' + calculate F1 score + ''' + def __init__(self): + self.TP = 0 + self.FP = 0 + self.FN = 0 + def update(self, logits, labels): + ''' + update F1 score + ''' + labels = labels.asnumpy() + labels = np.reshape(labels, -1) + if cfg.use_crf: + backpointers, best_tag_id = logits + best_path = postprocess(backpointers, best_tag_id) + logit_id = [] + for ele in best_path: + logit_id.extend(ele) + else: + logits = logits.asnumpy() + logit_id = np.argmax(logits, axis=-1) + logit_id = np.reshape(logit_id, -1) + pos_eva = np.isin(logit_id, [i for i in range(1, cfg.num_labels)]) + pos_label = np.isin(labels, [i for i in range(1, cfg.num_labels)]) + self.TP += np.sum(pos_eva&pos_label) + self.FP += np.sum(pos_eva&(~pos_label)) + self.FN += 
np.sum((~pos_eva)&pos_label) + +def get_dataset(batch_size=1, repeat_count=1, distribute_file=''): + ''' + get dataset + ''' + ds = de.TFRecordDataset([cfg.data_file], cfg.schema_file, columns_list=["input_ids", "input_mask", + "segment_ids", "label_ids"]) + type_cast_op = C.TypeCast(mstype.int32) + ds = ds.map(input_columns="segment_ids", operations=type_cast_op) + ds = ds.map(input_columns="input_mask", operations=type_cast_op) + ds = ds.map(input_columns="input_ids", operations=type_cast_op) + ds = ds.map(input_columns="label_ids", operations=type_cast_op) + ds = ds.repeat(repeat_count) + + # apply shuffle operation + buffer_size = 960 + ds = ds.shuffle(buffer_size=buffer_size) + + # apply batch operations + ds = ds.batch(batch_size, drop_remainder=True) + return ds + +def bert_predict(Evaluation): + ''' + prediction function + ''' + devid = int(os.getenv('DEVICE_ID')) + context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid) + dataset = get_dataset(bert_net_cfg.batch_size, 1) + if cfg.use_crf: + net_for_pretraining = Evaluation(bert_net_cfg, False, num_labels=len(tag_to_index), use_crf=True, + tag_to_index=tag_to_index, dropout_prob=0.0) + else: + net_for_pretraining = Evaluation(bert_net_cfg, False, num_labels) + net_for_pretraining.set_train(False) + param_dict = load_checkpoint(cfg.finetune_ckpt) + load_param_into_net(net_for_pretraining, param_dict) + model = Model(net_for_pretraining) + return model, dataset + +def test_eval(): + ''' + evaluation function + ''' + task_type = BertNER if cfg.task == "NER" else BertCLS + model, dataset = bert_predict(task_type) + if cfg.clue_benchmark: + submit(model, cfg.data_file, bert_net_cfg.seq_length) + else: + callback = F1() if cfg.task == "NER" else Accuracy() + columns_list = ["input_ids", "input_mask", "segment_ids", "label_ids"] + for data in dataset.create_dict_iterator(): + input_data = [] + for i in columns_list: + input_data.append(Tensor(data[i])) + input_ids, input_mask, 
token_type_id, label_ids = input_data + logits = model.predict(input_ids, input_mask, token_type_id, label_ids) + callback.update(logits, label_ids) + print("==============================================================") + if cfg.task == "NER": + print("Precision {:.6f} ".format(callback.TP / (callback.TP + callback.FP))) + print("Recall {:.6f} ".format(callback.TP / (callback.TP + callback.FN))) + print("F1 {:.6f} ".format(2*callback.TP / (2*callback.TP + callback.FP + callback.FP))) + else: + print("acc_num {} , total_num {}, accuracy {:.6f}".format(callback.acc_num, callback.total_num, + callback.acc_num / callback.total_num)) + print("==============================================================") + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--data_url', required=True, default=None, help='Location of data.') + parser.add_argument('--train_url', required=True, default=None, help='Location of training outputs.') + parser.add_argument('--num_epochs', type=int, default=1, help='Number of training epochs.') + args, unknown = parser.parse_known_args() + + import moxing as mox + mox.file.copy_parallel(src_url=args.data_url, dst_url='tnews/') + + num_labels = cfg.num_labels + test_eval() diff --git a/experiment_4/evaluation_config.py b/experiment_4/evaluation_config.py new file mode 100644 index 0000000..cc6b966 --- /dev/null +++ b/experiment_4/evaluation_config.py @@ -0,0 +1,53 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +""" +config settings, will be used in finetune.py +""" + +from easydict import EasyDict as edict +import mindspore.common.dtype as mstype +from mindspore.model_zoo.Bert_NEZHA import BertConfig + +cfg = edict({ + 'task': 'classfication', + 'num_labels': 15, + 'data_file': 'tnews/dev.tf_record', + 'schema_file': 'tnews/dev_schema.json', + 'finetune_ckpt': 'tnews/bert_classfication-3_3335.ckpt', + 'use_crf': False, + 'clue_benchmark': False, +}) + +bert_net_cfg = BertConfig( + batch_size=16 if not cfg.clue_benchmark else 1, + seq_length=128, + vocab_size=21128, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + use_relative_positions=False, + input_mask_from_dataset=True, + token_type_ids_from_dataset=True, + dtype=mstype.float32, + compute_type=mstype.float16, +) diff --git a/experiment_4/finetune.py b/experiment_4/finetune.py new file mode 100644 index 0000000..6397aee --- /dev/null +++ b/experiment_4/finetune.py @@ -0,0 +1,152 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================
+
+'''
+Bert finetune script.
+
+Builds a BERT classification (BertCLS) or sequence-labelling (BertNER) network
+from finetune_config.py, loads a pre-trained checkpoint into it and fine-tunes
+it on a TFRecord dataset with dynamic loss scaling.
+'''
+
+import os
+os.environ['P_NUM'] = '16'
+from utils import BertFinetuneCell, BertCLS, BertNER
+from finetune_config import cfg, bert_net_cfg, tag_to_index
+import mindspore.common.dtype as mstype
+import mindspore.communication.management as D
+from mindspore import context
+import mindspore.dataset as de
+import mindspore.dataset.transforms.c_transforms as C
+from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell
+from mindspore.nn.optim import AdamWeightDecay, AdamWeightDecayDynamicLR, Lamb, Momentum
+from mindspore.train.model import Model
+from mindspore.train.callback import Callback
+from mindspore.train.callback import CheckpointConfig, ModelCheckpoint
+from mindspore.train.serialization import load_checkpoint, load_param_into_net
+
+class LossCallBack(Callback):
+    '''
+    Print the network outputs (loss and overflow flag) during training.
+
+    Note:
+        If per_print_times is 0, nothing is printed.
+
+    Args:
+        per_print_times (int): Print once every `per_print_times` steps. Default: 1.
+
+    Raises:
+        ValueError: If per_print_times is not an int or is negative.
+    '''
+    def __init__(self, per_print_times=1):
+        super(LossCallBack, self).__init__()
+        if not isinstance(per_print_times, int) or per_print_times < 0:
+            raise ValueError("print_step must be int and >= 0.")
+        self._per_print_times = per_print_times
+
+    def step_end(self, run_context):
+        '''Print epoch, step and network outputs when the current step is a reporting step.'''
+        cb_params = run_context.original_args()
+        # Honour the documented contract: 0 disables printing, N prints every N-th step.
+        if self._per_print_times == 0 or cb_params.cur_step_num % self._per_print_times != 0:
+            return
+        print("epoch: {}, step: {}, outputs are {}".format(cb_params.cur_epoch_num, cb_params.cur_step_num,
+                                                           str(cb_params.net_outputs)))
+
+
+def get_dataset(batch_size=1, repeat_count=1, distribute_file=''):
+    '''
+    Build the fine-tuning dataset from cfg.data_file / cfg.schema_file.
+
+    All four columns are cast to int32, then the dataset is repeated, shuffled
+    and batched (incomplete trailing batches are dropped).
+
+    Args:
+        batch_size (int): Number of samples per batch. Default: 1.
+        repeat_count (int): Number of times the dataset is repeated. Default: 1.
+        distribute_file (str): Not used by this function. Default: ''.
+
+    Returns:
+        The batched dataset object.
+    '''
+    ds = de.TFRecordDataset([cfg.data_file], cfg.schema_file, columns_list=["input_ids", "input_mask",
+                                                                            "segment_ids", "label_ids"])
+    type_cast_op = C.TypeCast(mstype.int32)
+    ds = ds.map(input_columns="segment_ids", operations=type_cast_op)
+    ds = ds.map(input_columns="input_mask", operations=type_cast_op)
+    ds = ds.map(input_columns="input_ids", operations=type_cast_op)
+    ds = ds.map(input_columns="label_ids", operations=type_cast_op)
+    ds = ds.repeat(repeat_count)
+
+    # apply shuffle operation
+    buffer_size = 960
+    ds = ds.shuffle(buffer_size=buffer_size)
+
+    # apply batch operations
+    ds = ds.batch(batch_size, drop_remainder=True)
+    return ds
+
+def test_train():
+    '''
+    Fine-tune BERT according to finetune_config.py.
+
+    Selects the network from cfg.task / cfg.use_crf and the optimizer from
+    cfg.optimizer, loads cfg.pre_training_ckpt into the network, and trains
+    for cfg.epoch_num epochs while saving checkpoints.
+
+    Raises:
+        Exception: If cfg.optimizer names an unsupported optimizer.
+    '''
+    # Fall back to device 0 when DEVICE_ID is not exported instead of failing on int(None).
+    devid = int(os.getenv('DEVICE_ID', '0'))
+    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid,
+                        enable_mem_reuse=True, enable_task_sink=True)
+    # BertCLS for classification
+    # BertNER for sequence labeling
+    if cfg.task == 'NER':
+        if cfg.use_crf:
+            netwithloss = BertNER(bert_net_cfg, True, num_labels=len(tag_to_index), use_crf=True,
+                                  tag_to_index=tag_to_index, dropout_prob=0.1)
+        else:
+            netwithloss = BertNER(bert_net_cfg, True, num_labels=cfg.num_labels, dropout_prob=0.1)
+    else:
+        netwithloss = BertCLS(bert_net_cfg, True, num_labels=cfg.num_labels, dropout_prob=0.1)
+    dataset = get_dataset(bert_net_cfg.batch_size, cfg.epoch_num)
+    # optimizer
+    steps_per_epoch = dataset.get_dataset_size()
+    if cfg.optimizer == 'AdamWeightDecayDynamicLR':
+        optimizer = AdamWeightDecayDynamicLR(netwithloss.trainable_params(),
+                                             decay_steps=steps_per_epoch * cfg.epoch_num,
+                                             learning_rate=cfg.AdamWeightDecayDynamicLR.learning_rate,
+                                             end_learning_rate=cfg.AdamWeightDecayDynamicLR.end_learning_rate,
+                                             power=cfg.AdamWeightDecayDynamicLR.power,
+                                             #warmup_steps=steps_per_epoch,
+                                             weight_decay=cfg.AdamWeightDecayDynamicLR.weight_decay,
+                                             eps=cfg.AdamWeightDecayDynamicLR.eps)
+                                             #decay_filter=lambda x: 'LayerNorm' not in x.name and
+                                             # 'bias' not in x.name and
+                                             # 'layernorm' not in x.name)
+    elif cfg.optimizer == 'AdamWeightDecay':
+        optimizer = AdamWeightDecay(netwithloss.trainable_params(),
+                                    learning_rate=cfg.AdamWeightDecay.learning_rate,
+                                    weight_decay=cfg.AdamWeightDecay.weight_decay,
+                                    eps=cfg.AdamWeightDecay.eps,
+                                    decay_filter=lambda x: 'LayerNorm' not in x.name and
+                                    'bias' not in x.name and
+                                    'layernorm' not in x.name)
+    elif cfg.optimizer == 'Lamb':
+        optimizer = Lamb(netwithloss.trainable_params(), decay_steps=steps_per_epoch * cfg.epoch_num,
+                         start_learning_rate=cfg.Lamb.start_learning_rate, end_learning_rate=cfg.Lamb.end_learning_rate,
+                         power=cfg.Lamb.power, warmup_steps=steps_per_epoch, decay_filter=cfg.Lamb.decay_filter)
+    elif cfg.optimizer == 'Momentum':
+        optimizer = Momentum(netwithloss.trainable_params(), learning_rate=cfg.Momentum.learning_rate,
+                             momentum=cfg.Momentum.momentum)
+    else:
+        raise Exception("Optimizer not supported.")
+    print("check steps, steps_per_epoch: ", steps_per_epoch)
+    # checkpoint saving: one checkpoint every `steps_per_epoch` steps, only the latest is kept
+    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=1)
+    ckpoint_cb = ModelCheckpoint(prefix=cfg.ckpt_prefix, directory=cfg.ckpt_dir, config=ckpt_config)
+    # load the pre-trained checkpoint into the network
+    param_dict = load_checkpoint(cfg.pre_training_ckpt)
+    load_param_into_net(netwithloss, param_dict)
+
+    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
+    netwithgrads = BertFinetuneCell(netwithloss, optimizer=optimizer, scale_update_cell=update_cell)
+    model = Model(netwithgrads)
+    model.train(cfg.epoch_num, dataset, callbacks=[LossCallBack(), ckpoint_cb])
+
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--data_url', required=True, default=None, help='Location of data.')
+    parser.add_argument('--train_url', required=True, default=None, help='Location of training outputs.')
+    # NOTE(review): --num_epochs is parsed but never read below; the epoch count comes from cfg.epoch_num.
+    parser.add_argument('--num_epochs', type=int, default=1, help='Number of training epochs.')
+    args, unknown = parser.parse_known_args()
+
+    import moxing as mox
+    # Stage the remote data directory locally under tnews/, where finetune_config.py expects it.
+    mox.file.copy_parallel(src_url=args.data_url, dst_url='tnews/')
+
+    test_train()
+    # NOTE(review): the checkpoint file name is hard-coded and it is uploaded next to the data
+    # (data_url), not to train_url -- confirm this is intended.
+    mox.file.copy_parallel(src_url='bert_classfication-3_3335.ckpt',
+                           dst_url=os.path.join(args.data_url, 'bert_classfication-3_3335.ckpt'))
diff --git a/experiment_4/finetune_config.py b/experiment_4/finetune_config.py
new file mode 100644
index 0000000..616b93a
--- /dev/null
+++ b/experiment_4/finetune_config.py
@@ -0,0 +1,124 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""
+config settings, will be used in finetune.py
+
+Defines three module-level objects:
+    cfg          -- task, data, checkpoint and optimizer settings.
+    bert_net_cfg -- BertConfig describing the network to fine-tune.
+    tag_to_index -- tag name -> integer id mapping for the NER task.
+"""
+
+from easydict import EasyDict as edict
+import mindspore.common.dtype as mstype
+from mindspore.model_zoo.Bert_NEZHA import BertConfig
+
+cfg = edict({
+    # 'NER' selects the sequence-labelling network in finetune.py; any other value selects classification.
+    'task': 'nothing',
+    'num_labels': 15,
+    'data_file': 'tnews/train.tf_record',
+    'schema_file': 'tnews/train_schema.json',
+    'epoch_num': 3,
+    'ckpt_prefix': 'bert_classfication',
+    'ckpt_dir': None,
+    'pre_training_ckpt': 'tnews/bert_base.ckpt',
+    'use_crf': False,
+    # Must be one of the optimizer names that has a settings block below.
+    'optimizer': 'AdamWeightDecayDynamicLR',
+    'AdamWeightDecay': edict({
+        'learning_rate': 2e-5,
+        'weight_decay': 1e-5,
+        'eps': 1e-6,
+    }),
+    'AdamWeightDecayDynamicLR': edict({
+        'learning_rate': 2e-5,
+        'end_learning_rate': 1e-7,
+        'power': 1.0,
+        'weight_decay': 1e-5,
+        'eps': 1e-6,
+    }),
+    'Lamb': edict({
+        'start_learning_rate': 2e-5,
+        'end_learning_rate': 1e-7,
+        'power': 1.0,
+        'decay_filter': lambda x: False,
+    }),
+    'Momentum': edict({
+        'learning_rate': 2e-5,
+        'momentum': 0.9,
+    }),
+})
+
+bert_net_cfg = BertConfig(
+    batch_size=16,
+    seq_length=128,
+    vocab_size=21128,
+    hidden_size=768,
+    num_hidden_layers=12,
+    num_attention_heads=12,
+    intermediate_size=3072,
+    hidden_act="gelu",
+    hidden_dropout_prob=0.1,
+    attention_probs_dropout_prob=0.1,
+    max_position_embeddings=512,
+    type_vocab_size=2,
+    initializer_range=0.02,
+    use_relative_positions=False,
+    input_mask_from_dataset=True,
+    token_type_ids_from_dataset=True,
+    dtype=mstype.float32,
+    compute_type=mstype.float16,
+)
+
+# Tag name -> id for the NER task. Entity tags use an S_/B_/M_/E_ prefix followed by the entity type.
+tag_to_index = {
+    "O": 0,
+    "S_address": 1,
+    "B_address": 2,
+    "M_address": 3,
+    "E_address": 4,
+    "S_book": 5,
+    "B_book": 6,
+    "M_book": 7,
+    "E_book": 8,
+    "S_company": 9,
+    "B_company": 10,
+    "M_company": 11,
+    "E_company": 12,
+    "S_game": 13,
+    "B_game": 14,
+    "M_game": 15,
+    "E_game": 16,
+    "S_government": 17,
+    "B_government": 18,
+    "M_government": 19,
+    "E_government": 20,
+    "S_movie": 21,
+    "B_movie": 22,
+    "M_movie": 23,
+    "E_movie": 24,
+    "S_name": 25,
+    "B_name": 26,
+    "M_name": 27,
+    "E_name": 28,
+    "S_organization": 29,
+    "B_organization": 30,
+    "M_organization": 31,
+    "E_organization": 32,
+    "S_position": 33,
+    "B_position": 34,
+    "M_position": 35,
+    "E_position": 36,
+    "S_scene": 37,
+    "B_scene": 38,
+    "M_scene": 39,
+    "E_scene": 40,
+    # The last two keys were both empty strings, so the second silently overwrote the first and
+    # the mapping had 42 entries instead of 43. Restored as the sequence start/stop tags.
+    # NOTE(review): names follow the MindSpore BERT CRF convention -- confirm against CRF.py.
+    "<START>": 41,
+    "<STOP>": 42
+}
diff --git a/experiment_4/pretrain.py b/experiment_4/pretrain.py
new file mode 100644
index 0000000..7e6796e
--- /dev/null
+++ b/experiment_4/pretrain.py
@@ -0,0 +1,167 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""Pre-train a BERT network with dynamic loss scaling.
+
+The dataset is read from zhwiki_part/ and the network variant and batch size
+are selected through the VERSION and BATCH_SIZE environment variables.
+"""
+
+import os
+import numpy as np
+from numpy import allclose
+import mindspore.common.dtype as mstype
+import mindspore.dataset.engine.datasets as de
+import mindspore.dataset.transforms.c_transforms as C
+from mindspore import context
+from mindspore.common.tensor import Tensor
+from mindspore.train.model import Model
+from mindspore.train.callback import Callback, LossMonitor
+from mindspore.train.loss_scale_manager import DynamicLossScaleManager
+from mindspore.model_zoo.Bert_NEZHA import BertConfig, BertNetworkWithLoss, BertTrainOneStepWithLossScaleCell
+from mindspore.nn.optim import Momentum
+from mindspore import log as logger
+
+
+# Local paths of the TFRecord data and its schema; the __main__ section copies them here.
+DATA_DIR = ["zhwiki_part/part.tfrecord"]
+SCHEMA_DIR = "zhwiki_part/schema.json"
+
+
+def get_config(version='base', batch_size=1):
+    """Return the BertConfig for the requested network variant.
+
+    Args:
+        version (str): 'base', 'large' or 'large_mixed'. Any other value yields
+            a BertConfig built with only `batch_size` set. Default: 'base'.
+        batch_size (int): Batch size stored in the config. Default: 1.
+
+    Returns:
+        BertConfig, the network configuration.
+    """
+    if version == 'base':
+        bert_config = BertConfig(
+            batch_size=batch_size,
+            seq_length=128,
+            vocab_size=21136,
+            hidden_size=768,
+            num_hidden_layers=12,
+            num_attention_heads=12,
+            intermediate_size=3072,
+            hidden_act="gelu",
+            hidden_dropout_prob=0.1,
+            attention_probs_dropout_prob=0.1,
+            max_position_embeddings=512,
+            type_vocab_size=2,
+            initializer_range=0.02,
+            use_relative_positions=True,
+            input_mask_from_dataset=True,
+            token_type_ids_from_dataset=True,
+            dtype=mstype.float32,
+            compute_type=mstype.float32)
+    elif version == 'large':
+        # NOTE(review): 12 hidden layers with float16 compute here versus 24 layers with float32
+        # compute for 'large_mixed' -- confirm the two variants are not swapped.
+        bert_config = BertConfig(
+            batch_size=batch_size,
+            seq_length=128,
+            vocab_size=21136,
+            hidden_size=1024,
+            num_hidden_layers=12,
+            num_attention_heads=16,
+            intermediate_size=4096,
+            hidden_act="gelu",
+            hidden_dropout_prob=0.0,
+            attention_probs_dropout_prob=0.0,
+            max_position_embeddings=512,
+            type_vocab_size=2,
+            initializer_range=0.02,
+            use_relative_positions=True,
+            input_mask_from_dataset=True,
+            token_type_ids_from_dataset=True,
+            dtype=mstype.float32,
+            compute_type=mstype.float16)
+    elif version == 'large_mixed':
+        bert_config = BertConfig(
+            batch_size=batch_size,
+            seq_length=128,
+            vocab_size=21136,
+            hidden_size=1024,
+            num_hidden_layers=24,
+            num_attention_heads=16,
+            intermediate_size=4096,
+            hidden_act="gelu",
+            hidden_dropout_prob=0.0,
+            attention_probs_dropout_prob=0.0,
+            max_position_embeddings=512,
+            type_vocab_size=2,
+            initializer_range=0.02,
+            use_relative_positions=True,
+            input_mask_from_dataset=True,
+            token_type_ids_from_dataset=True,
+            dtype=mstype.float32,
+            compute_type=mstype.float32)
+    else:
+        bert_config = BertConfig(batch_size=batch_size)
+    return bert_config
+
+def create_dataset():
+    """Build the pre-training dataset from DATA_DIR / SCHEMA_DIR.
+
+    Six of the seven columns are cast to int32 (masked_lm_weights is left as
+    read). The dataset is batched with the BATCH_SIZE environment variable
+    (default 16) and then repeated `args.num_epochs` times.
+
+    Note:
+        Reads the module-level `args`, which only exists when this file is run
+        as a script.
+
+    Returns:
+        The batched and repeated dataset object.
+    """
+    # apply repeat operations
+    repeat_count = args.num_epochs
+    ds = de.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["input_ids", "input_mask", "segment_ids",
+                                                                "next_sentence_labels", "masked_lm_positions",
+                                                                "masked_lm_ids", "masked_lm_weights"], shuffle=False)
+    type_cast_op = C.TypeCast(mstype.int32)
+    ds = ds.map(input_columns="masked_lm_ids", operations=type_cast_op)
+    ds = ds.map(input_columns="masked_lm_positions", operations=type_cast_op)
+    ds = ds.map(input_columns="next_sentence_labels", operations=type_cast_op)
+    ds = ds.map(input_columns="segment_ids", operations=type_cast_op)
+    ds = ds.map(input_columns="input_mask", operations=type_cast_op)
+    ds = ds.map(input_columns="input_ids", operations=type_cast_op)
+    # apply batch operations
+    batch_size = int(os.getenv('BATCH_SIZE', '16'))
+    ds = ds.batch(batch_size, drop_remainder=True)
+    ds = ds.repeat(repeat_count)
+    return ds
+
+
+class ModelCallback(Callback):
+    """Callback that prints the epoch number and the network outputs after every step."""
+    def __init__(self):
+        super(ModelCallback, self).__init__()
+
+    def step_end(self, run_context):
+        cb_params = run_context.original_args()
+        print("epoch: {}, outputs are: {}".format(cb_params.cur_epoch_num, str(cb_params.net_outputs)))
+
+
+def test_bert_tdt():
+    """Pre-train BERT on Ascend in graph mode.
+
+    Builds the dataset, the network selected by the VERSION environment
+    variable (default 'base'), a Momentum optimizer and a dynamic loss-scale
+    training cell, then trains for the dataset's repeat count.
+    """
+    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False)
+    context.set_context(enable_task_sink=True)
+    # context.set_context(enable_loop_sink=True)
+    context.set_context(enable_mem_reuse=True)
+    ds = create_dataset()
+    version = os.getenv('VERSION', 'base')
+    # Same environment variable and default as in create_dataset().
+    batch_size = int(os.getenv('BATCH_SIZE', '16'))
+    config = get_config(version=version, batch_size=batch_size)
+    netwithloss = BertNetworkWithLoss(config, True)
+    optimizer = Momentum(netwithloss.trainable_params(), learning_rate=2e-5, momentum=0.9)
+    scale_window = 3
+    scale_manager = DynamicLossScaleManager(2**32, 2, scale_window)
+    netwithgrads = BertTrainOneStepWithLossScaleCell(netwithloss, optimizer=optimizer,
+                                                     scale_update_cell=scale_manager.get_update_cell())
+    netwithgrads.set_train(True)
+    model = Model(netwithgrads)
+    callback = ModelCallback()
+    # loss_cb = LossMonitor(per_print_times=ds.get_dataset_size())
+    model.train(ds.get_repeat_count(), ds, callbacks=callback)
+
+
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--data_url', required=True, default=None, help='Location of data.')
+    parser.add_argument('--train_url', required=True, default=None, help='Location of training outputs.')
+    parser.add_argument('--num_epochs', type=int, default=50, help='Number of training epochs.')
+    args, unknown = parser.parse_known_args()
+
+    import moxing as mox
+    # Stage the remote data directory locally under zhwiki_part/, where DATA_DIR / SCHEMA_DIR point.
+    mox.file.copy_parallel(src_url=args.data_url, dst_url='zhwiki_part/')
+
+    test_bert_tdt()
diff --git a/experiment_4/sample_process.py b/experiment_4/sample_process.py
new file mode 100644
index 0000000..7a7752e
--- /dev/null
+++ b/experiment_4/sample_process.py
@@ -0,0 +1,100 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""process txt"""
+
+import re
+import json
+
+def process_one_example_p(tokenizer, text, max_seq_len=128):
+    """Convert one line of text into fixed-length BERT input features.
+
+    The text is tokenized character by character, truncated so that
+    "[CLS]" + tokens + "[SEP]" fits into `max_seq_len`, converted to ids and
+    padded with zeros.
+
+    Args:
+        tokenizer: Object providing `tokenize(str)` and `convert_tokens_to_ids(list)`.
+        text (str): The input line.
+        max_seq_len (int): Length of every returned list. Default: 128.
+
+    Returns:
+        tuple, (input_ids, input_mask, segment_ids), each a list of
+        `max_seq_len` ints. `segment_ids` is all zeros.
+    """
+    tokens = []
+    for char in text:
+        tokens.extend(tokenizer.tokenize(char))
+    # Reserve two positions for the [CLS] and [SEP] markers.
+    tokens = tokens[:max_seq_len - 2]
+    ntokens = ["[CLS]"] + tokens + ["[SEP]"]
+    input_ids = list(tokenizer.convert_tokens_to_ids(ntokens))
+    # Real tokens get mask 1, padding gets id 0 and mask 0.
+    pad_len = max_seq_len - len(input_ids)
+    input_mask = [1] * len(input_ids) + [0] * pad_len
+    input_ids += [0] * pad_len
+    segment_ids = [0] * max_seq_len
+    assert len(input_ids) == max_seq_len
+    assert len(input_mask) == max_seq_len
+    assert len(segment_ids) == max_seq_len
+
+    feature = (input_ids, input_mask, segment_ids)
+    return feature
+
+def label_generation(text, probs):
+    """Turn per-token label ids into a dict of labelled text spans.
+
+    Label names are read from tnews/label2id.json. A tag starting with "B" or
+    "S" opens a span, a tag starting with "O" closes the open span, and the
+    span's label is the tag with its first two characters removed.
+
+    Args:
+        text (str): The original text.
+        probs: Sequence of predicted label ids; element 0 is skipped and the
+            next len(text) elements are aligned with the characters of `text`.
+
+    Returns:
+        dict, {label: {span_text: [[start, end]]}} with inclusive character
+        offsets. A repeated span text under the same label keeps only its
+        last position.
+    """
+    # Close the file deterministically instead of leaking the handle.
+    with open("tnews/label2id.json") as label_file:
+        label2id = json.load(label_file)
+    # The list position is used as the label id, i.e. this relies on the key order of the JSON file.
+    id2label = list(label2id)
+    result = [id2label[int(v)] for v in probs[1:len(text) + 1]]
+
+    labels = {}
+
+    def _record(label, begin, end):
+        """Store text[begin:end] under `label` with inclusive offsets."""
+        labels.setdefault(label, {})[text[begin:end]] = [[begin, end - 1]]
+
+    start = None
+    index = 0
+    for tag in result:
+        if re.search("^[BS]", tag):
+            # A new entity begins; flush the one that was still open.
+            if start is not None:
+                _record(result[index - 1][2:], start, index)
+            start = index
+        if re.search("^O", tag):
+            if start is not None:
+                _record(result[index - 1][2:], start, index)
+            start = None
+        index += 1
+    # Flush an entity that runs up to the end of the tag sequence.
+    if start is not None:
+        _record(result[start][2:], start, index)
+    return labels
diff --git a/experiment_4/tokenization.py b/experiment_4/tokenization.py
new file mode 100644
index 0000000..edc9abf
--- /dev/null
+++ b/experiment_4/tokenization.py
@@ -0,0 +1,388 @@
+"""Tokenization classes."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import io
+import re
+import unicodedata
+import six
+#import tensorflow as tf
+
+
+def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
+    """Checks whether the casing config is consistent with the checkpoint name.
+
+    Args:
+        do_lower_case (bool): The casing flag passed by the user.
+        init_checkpoint (str): Checkpoint path; nothing is checked when it is
+            empty or does not look like ".../<model_name>/bert_model.ckpt".
+
+    Raises:
+        ValueError: If the model name is a known lower-cased model and
+            `do_lower_case` is False, or a known cased model and it is True.
+    """
+
+    # The casing has to be passed in by the user and there is no explicit check
+    # as to whether it matches the checkpoint. The casing information probably
+    # should have been stored in the bert_config.json file, but it's not, so
+    # we have to heuristically detect it to validate.
+
+    if not init_checkpoint:
+        return
+
+    m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
+    if m is None:
+        return
+
+    model_name = m.group(1)
+
+    lower_models = [
+        "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
+        "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12"
+    ]
+
+    cased_models = [
+        "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
+        "multi_cased_L-12_H-768_A-12"
+    ]
+
+    is_bad_config = False
+    if model_name in lower_models and not do_lower_case:
+        is_bad_config = True
+        actual_flag = "False"
+        case_name = "lowercased"
+        opposite_flag = "True"
+
+    if model_name in cased_models and do_lower_case:
+        is_bad_config = True
+        actual_flag = "True"
+        case_name = "cased"
+        opposite_flag = "False"
+
+    if is_bad_config:
+        raise ValueError(
+            "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
+            "However, `%s` seems to be a %s model, so you "
+            "should pass in `--do_lower_case=%s` so that the fine-tuning matches "
+            "how the model was pre-training. If this error is wrong, please "
+            "just comment out this check." % (actual_flag, init_checkpoint,
+                                              model_name, case_name, opposite_flag))
+
+
+def convert_to_unicode(text):
+    """Converts `text` to Unicode (if it's not already), assuming utf-8 input.
+
+    Raises:
+        ValueError: If `text` is neither a text string nor a byte string.
+    """
+    if six.PY3:
+        if isinstance(text, str):
+            return text
+        elif isinstance(text, bytes):
+            return text.decode("utf-8", "ignore")
+        else:
+            raise ValueError("Unsupported string type: %s" % (type(text)))
+    elif six.PY2:
+        if isinstance(text, str):
+            return text.decode("utf-8", "ignore")
+        elif isinstance(text, unicode):
+            return text
+        else:
+            raise ValueError("Unsupported string type: %s" % (type(text)))
+    else:
+        raise ValueError("Not running on Python2 or Python 3?")
+
+
+def printable_text(text):
+    """Returns text encoded in a way suitable for print or `tf.logging`.
+
+    Raises:
+        ValueError: If `text` is neither a text string nor a byte string.
+    """
+
+    # These functions want `str` for both Python2 and Python3, but in one case
+    # it's a Unicode string and in the other it's a byte string.
+    if six.PY3:
+        if isinstance(text, str):
+            return text
+        elif isinstance(text, bytes):
+            return text.decode("utf-8", "ignore")
+        else:
+            raise ValueError("Unsupported string type: %s" % (type(text)))
+    elif six.PY2:
+        if isinstance(text, str):
+            return text
+        elif isinstance(text, unicode):
+            return text.encode("utf-8")
+        else:
+            raise ValueError("Unsupported string type: %s" % (type(text)))
+    else:
+        raise ValueError("Not running on Python2 or Python 3?")
+
+
+def load_vocab(vocab_file):
+    """Loads a vocabulary file into a dictionary.
+
+    Args:
+        vocab_file (str): Path of a UTF-8 text file with one token per line.
+
+    Returns:
+        collections.OrderedDict, token -> zero-based line index. Tokens are
+        stripped of surrounding whitespace; a token that occurs twice keeps
+        the index of its last occurrence.
+    """
+    vocab = collections.OrderedDict()
+    # Decode explicitly as UTF-8 so the result does not depend on the platform's default
+    # encoding; io.open accepts `encoding` on both Python 2 and Python 3.
+    with io.open(vocab_file, "r", encoding="utf-8") as reader:
+        for index, line in enumerate(reader):
+            vocab[convert_to_unicode(line).strip()] = index
+    return vocab
+
+
+def convert_by_vocab(vocab, items):
+    """Converts a sequence of [tokens|ids] using the vocab.
+
+    Items missing from `vocab` are mapped to `vocab['[UNK]']`, so a missing
+    item raises KeyError when `vocab` has no '[UNK]' key (as is the case for
+    an id -> token mapping).
+    """
+    output = []
+    for item in items:
+        if item in vocab:
+            output.append(vocab[item])
+        else:
+            output.append(vocab['[UNK]'])
+    return output
+
+
+def convert_tokens_to_ids(vocab, tokens):
+    """Maps tokens to ids with `convert_by_vocab`."""
+    return convert_by_vocab(vocab, tokens)
+
+
+def convert_ids_to_tokens(inv_vocab, ids):
+    """Maps ids to tokens with `convert_by_vocab`."""
+    return convert_by_vocab(inv_vocab, ids)
+
+
+def whitespace_tokenize(text):
+    """Runs basic whitespace cleaning and splitting on a piece of text."""
+    text = text.strip()
+    if not text:
+        return []
+    tokens = text.split()
+    return tokens
+
+
+class FullTokenizer(object):
+    """Runs end-to-end tokenization: basic tokenization followed by WordPiece.
+
+    Args:
+        vocab_file (str): Path of the vocabulary file, see `load_vocab`.
+        do_lower_case (bool): Whether to lower case the input. Default: True.
+    """
+
+    def __init__(self, vocab_file, do_lower_case=True):
+        self.vocab = load_vocab(vocab_file)
+        self.inv_vocab = {v: k for k, v in self.vocab.items()}
+        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
+
+    def tokenize(self, text):
+        """Returns the list of wordpiece tokens of `text`."""
+        split_tokens = []
+        for token in self.basic_tokenizer.tokenize(text):
+            for sub_token in self.wordpiece_tokenizer.tokenize(token):
+                split_tokens.append(sub_token)
+
+        return split_tokens
+
+    def convert_tokens_to_ids(self, tokens):
+        """Maps tokens to vocabulary ids."""
+        return convert_by_vocab(self.vocab, tokens)
+
+    def convert_ids_to_tokens(self, ids):
+        """Maps vocabulary ids back to tokens."""
+        return convert_by_vocab(self.inv_vocab, ids)
+
+
+class BasicTokenizer(object):
+    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
+
+    def __init__(self, do_lower_case=True):
+        """Constructs a BasicTokenizer.
+
+        Args:
+            do_lower_case: Whether to lower case the input.
+        """
+        self.do_lower_case = do_lower_case
+
+    def tokenize(self, text):
+        """Tokenizes a piece of text."""
+        text = convert_to_unicode(text)
+        text = self._clean_text(text)
+
+        # This was added on November 1st, 2018 for the multilingual and Chinese
+        # models. This is also applied to the English models now, but it doesn't
+        # matter since the English models were not trained on any Chinese data
+        # and generally don't have any Chinese data in them (there are Chinese
+        # characters in the vocabulary because Wikipedia does have some Chinese
+        # words in the English Wikipedia.).
+        text = self._tokenize_chinese_chars(text)
+
+        orig_tokens = whitespace_tokenize(text)
+        split_tokens = []
+        for token in orig_tokens:
+            if self.do_lower_case:
+                token = token.lower()
+                token = self._run_strip_accents(token)
+            split_tokens.extend(self._run_split_on_punc(token))
+
+        output_tokens = whitespace_tokenize(" ".join(split_tokens))
+        return output_tokens
+
+    def _run_strip_accents(self, text):
+        """Strips accents from a piece of text."""
+        text = unicodedata.normalize("NFD", text)
+        output = []
+        for char in text:
+            cat = unicodedata.category(char)
+            # "Mn" (nonspacing mark) covers the combining accents produced by NFD normalization.
+            if cat == "Mn":
+                continue
+            output.append(char)
+        return "".join(output)
+
+    def _run_split_on_punc(self, text):
+        """Splits punctuation on a piece of text."""
+        chars = list(text)
+        i = 0
+        start_new_word = True
+        output = []
+        while i < len(chars):
+            char = chars[i]
+            if _is_punctuation(char):
+                # Every punctuation character becomes a token of its own.
+                output.append([char])
+                start_new_word = True
+            else:
+                if start_new_word:
+                    output.append([])
+                start_new_word = False
+                output[-1].append(char)
+            i += 1
+
+        return ["".join(x) for x in output]
+
+    def _tokenize_chinese_chars(self, text):
+        """Adds whitespace around any CJK character."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if self._is_chinese_char(cp):
+                output.append(" ")
+                output.append(char)
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+    def _is_chinese_char(self, cp):
+        """Checks whether CP is the codepoint of a CJK character."""
+        # This defines a "chinese character" as anything in the CJK Unicode block:
+        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+        # despite its name. The modern Korean Hangul alphabet is a different block,
+        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+        # space-separated words, so they are not treated specially and handled
+        # like all of the other languages.
+        if ((cp >= 0x4E00 and cp <= 0x9FFF) or
+                (cp >= 0x3400 and cp <= 0x4DBF) or
+                (cp >= 0x20000 and cp <= 0x2A6DF) or
+                (cp >= 0x2A700 and cp <= 0x2B73F) or
+                (cp >= 0x2B740 and cp <= 0x2B81F) or
+                (cp >= 0x2B820 and cp <= 0x2CEAF) or
+                (cp >= 0xF900 and cp <= 0xFAFF) or
+                (cp >= 0x2F800 and cp <= 0x2FA1F)):
+            return True
+
+        return False
+
+    def _clean_text(self, text):
+        """Performs invalid character removal and whitespace cleanup on text."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            # Drop NUL, the Unicode replacement character and control characters.
+            if cp == 0 or cp == 0xfffd or _is_control(char):
+                continue
+            if _is_whitespace(char):
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+
+class WordpieceTokenizer(object):
+    """Runs WordPiece tokenization."""
+
+    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
+        self.vocab = vocab
+        self.unk_token = unk_token
+        self.max_input_chars_per_word = max_input_chars_per_word
+
+    def tokenize(self, text):
+        """Tokenizes a piece of text into its word pieces.
+
+        This uses a greedy longest-match-first algorithm to perform tokenization
+        using the given vocabulary.
+
+        For example:
+            input = "unaffable"
+            output = ["un", "##aff", "##able"]
+
+        Args:
+            text: A single token or whitespace separated tokens. This should have
+                already been passed through `BasicTokenizer`.
+
+        Returns:
+            A list of wordpiece tokens. A word longer than
+            `max_input_chars_per_word`, or one that cannot be fully split into
+            vocabulary pieces, is replaced by `unk_token`.
+        """
+
+        text = convert_to_unicode(text)
+
+        output_tokens = []
+        for token in whitespace_tokenize(text):
+            chars = list(token)
+            if len(chars) > self.max_input_chars_per_word:
+                output_tokens.append(self.unk_token)
+                continue
+
+            is_bad = False
+            start = 0
+            sub_tokens = []
+            while start < len(chars):
+                end = len(chars)
+                cur_substr = None
+                # Shrink the candidate from the right until it is found in the vocabulary.
+                while start < end:
+                    substr = "".join(chars[start:end])
+                    if start > 0:
+                        # Pieces that do not start the word carry the "##" prefix.
+                        substr = "##" + substr
+                    if substr in self.vocab:
+                        cur_substr = substr
+                        break
+                    end -= 1
+                if cur_substr is None:
+                    is_bad = True
+                    break
+                sub_tokens.append(cur_substr)
+                start = end
+
+            if is_bad:
+                output_tokens.append(self.unk_token)
+            else:
+                output_tokens.extend(sub_tokens)
+        return output_tokens
+
+
+def _is_whitespace(char):
+    """Checks whether `char` is a whitespace character."""
+    # \t, \n, and \r are technically control characters but we treat them
+    # as whitespace since they are generally considered as such.
+    if char == " " or char == "\t" or char == "\n" or char == "\r":
+        return True
+    cat = unicodedata.category(char)
+    if cat == "Zs":
+        return True
+    return False
+
+
+def _is_control(char):
+    """Checks whether `char` is a control character."""
+    # These are technically control characters but we count them as whitespace
+    # characters.
+    if char == "\t" or char == "\n" or char == "\r":
+        return False
+    cat = unicodedata.category(char)
+    if cat.startswith("C"):
+        return True
+    return False
+
+
+def _is_punctuation(char):
+    """Checks whether `char` is a punctuation character."""
+    cp = ord(char)
+    # We treat all non-letter/number ASCII as punctuation.
+    # Characters such as "^", "$", and "`" are not in the Unicode
+    # Punctuation class but we treat them as punctuation anyways, for
+    # consistency.
+    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
+            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
+        return True
+    cat = unicodedata.category(char)
+    if cat.startswith("P"):
+        return True
+    return False
diff --git a/experiment_4/utils.py b/experiment_4/utils.py
new file mode 100644
index 0000000..ceae53f
--- /dev/null
+++ b/experiment_4/utils.py
@@ -0,0 +1,263 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+'''
+Functional Cells used in Bert finetune and evaluation.
+'''
+
+import mindspore.nn as nn
+from mindspore.common.initializer import TruncatedNormal
+from mindspore.ops import operations as P
+from mindspore.ops import functional as F
+from mindspore.ops import composite as C
+from mindspore.common.tensor import Tensor
+from mindspore.common.parameter import Parameter, ParameterTuple
+from mindspore.common import dtype as mstype
+from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
+from mindspore.train.parallel_utils import ParallelMode
+from mindspore.communication.management import get_group_size
+from mindspore import context
+from mindspore.model_zoo.Bert_NEZHA.bert_model import BertModel
+from mindspore.model_zoo.Bert_NEZHA.bert_for_pre_training import ClipGradients
+from CRF import CRF
+
+# Arguments passed to ClipGradients in BertFinetuneCell.construct.
+GRADIENT_CLIP_TYPE = 1
+GRADIENT_CLIP_VALUE = 1.0
+grad_scale = C.MultitypeFuncGraph("grad_scale")
+reciprocal = P.Reciprocal()
+
+# Un-scales one gradient: divides `grad` by the loss scale `scale`.
+@grad_scale.register("Tensor", "Tensor")
+def tensor_grad_scale(scale, grad):
+    return grad * reciprocal(scale)
+
+class BertFinetuneCell(nn.Cell):
+    """
+    Training cell for fine-tuning, where only four input tensors are needed.
+
+    One call computes the loss, computes loss-scaled gradients, un-scales and
+    clips them, reduces them across devices when running data/hybrid parallel,
+    checks the float status for overflow and applies the optimizer only when
+    no overflow was detected.
+
+    Args:
+        network (Cell): Network that returns the loss for
+            (input_ids, input_mask, token_type_id, label_ids).
+        optimizer (Cell): Optimizer applied to the gradients.
+        scale_update_cell (Cell): Cell that updates the loss scale; it also
+            provides the initial value through get_loss_scale(). Default: None.
+
+    Outputs:
+        Tuple (loss, cond), where cond is the overflow condition.
+    """
+    def __init__(self, network, optimizer, scale_update_cell=None):
+
+        super(BertFinetuneCell, self).__init__(auto_prefix=False)
+        self.network = network
+        self.weights = ParameterTuple(network.trainable_params())
+        self.optimizer = optimizer
+        self.grad = C.GradOperation('grad',
+                                    get_by_list=True,
+                                    sens_param=True)
+        self.reducer_flag = False
+        self.allreduce = P.AllReduce()
+        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
+        # Gradients are only reduced across devices in data-parallel or hybrid-parallel mode.
+        if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
+            self.reducer_flag = True
+        self.grad_reducer = None
+        if self.reducer_flag:
+            mean = context.get_auto_parallel_context("mirror_mean")
+            degree = get_group_size()
+            self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
+        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
+        self.clip_gradients = ClipGradients()
+        self.cast = P.Cast()
+        # Float-status operators used for overflow detection.
+        self.alloc_status = P.NPUAllocFloatStatus()
+        self.get_status = P.NPUGetFloatStatus()
+        self.clear_before_grad = P.NPUClearFloatStatus()
+        self.reduce_sum = P.ReduceSum(keep_dims=False)
+        self.depend_parameter_use = P.ControlDepend(depend_mode=1)
+        self.base = Tensor(1, mstype.float32)
+        self.less_equal = P.LessEqual()
+        self.hyper_map = C.HyperMap()
+        self.loss_scale = None
+        self.loss_scaling_manager = scale_update_cell
+        if scale_update_cell:
+            self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32),
+                                        name="loss_scale")
+
+    def construct(self,
+                  input_ids,
+                  input_mask,
+                  token_type_id,
+                  label_ids,
+                  sens=None):
+
+        weights = self.weights
+        init = self.alloc_status()
+        loss = self.network(input_ids,
+                            input_mask,
+                            token_type_id,
+                            label_ids)
+        # Use the managed loss scale unless the caller supplies an explicit sensitivity.
+        if sens is None:
+            scaling_sens = self.loss_scale
+        else:
+            scaling_sens = sens
+        grads = self.grad(self.network, weights)(input_ids,
+                                                 input_mask,
+                                                 token_type_id,
+                                                 label_ids,
+                                                 self.cast(scaling_sens,
+                                                           mstype.float32))
+        clear_before_grad = self.clear_before_grad(init)
+        F.control_depend(loss, init)
+        self.depend_parameter_use(clear_before_grad, scaling_sens)
+        # Undo the loss scaling on every gradient, then clip.
+        grads = self.hyper_map(F.partial(grad_scale, scaling_sens), grads)
+        grads = self.clip_gradients(grads, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE)
+        if self.reducer_flag:
+            grads = self.grad_reducer(grads)
+        flag = self.get_status(init)
+        flag_sum = self.reduce_sum(init, (0,))
+        # cond is true when the (all-reduced, if distributed) status sum is at least 1.
+        if self.is_distributed:
+            flag_reduce = self.allreduce(flag_sum)
+            cond = self.less_equal(self.base, flag_reduce)
+        else:
+            cond = self.less_equal(self.base, flag_sum)
+        F.control_depend(grads, flag)
+        F.control_depend(flag, flag_sum)
+        overflow = cond
+        if sens is None:
+            overflow = self.loss_scaling_manager(self.loss_scale, cond)
+        # Skip the parameter update for this step when an overflow was detected.
+        if overflow:
+            succ = False
+        else:
+            succ = self.optimizer(grads)
+        ret = (loss, cond)
+        return F.depend(ret, succ)
+
+class BertCLSModel(nn.Cell):
+    """
+    This class is responsible for classification task evaluation, i.e. XNLI(num_labels=3),
+    LCQMC(num_labels=2), Chnsenti(num_labels=2). The returned output is the log_softmax of
+    the logits computed from BERT's pooled output.
+    """
+    def __init__(self, config, is_training, num_labels=2, dropout_prob=0.0, use_one_hot_embeddings=False):
+        super(BertCLSModel, self).__init__()
+        self.bert = BertModel(config, is_training, use_one_hot_embeddings)
+        self.cast = P.Cast()
+        self.weight_init = TruncatedNormal(config.initializer_range)
+        self.log_softmax = P.LogSoftmax(axis=-1)
+        self.dtype = config.dtype
+        self.num_labels = num_labels
+        self.dense_1 = nn.Dense(config.hidden_size, self.num_labels, weight_init=self.weight_init,
+                                has_bias=True).to_float(config.compute_type)
+        # NOTE(review): presumably nn.Dropout takes the keep probability here -- confirm for the MindSpore version in use.
+        self.dropout = nn.Dropout(1 - dropout_prob)
+
+    def construct(self, input_ids, input_mask, token_type_id):
+        # Note the argument order expected by BertModel: token_type_id before input_mask.
+        _, pooled_output, _ = \
+            self.bert(input_ids, token_type_id, input_mask)
+        cls = self.cast(pooled_output, self.dtype)
+        cls = self.dropout(cls)
+        logits = self.dense_1(cls)
+        logits = self.cast(logits, self.dtype)
+        log_probs = self.log_softmax(logits)
+        return log_probs
+
+
+class BertNERModel(nn.Cell):
+    """
+    This class is responsible for sequence labeling task evaluation, i.e. NER(num_labels=11).
+    With use_crf the raw logits are returned, reshaped to (batch_size, seq_length, num_labels);
+    otherwise the log_softmax of the logits is returned.
+    """
+    def __init__(self, config, is_training, num_labels=11, use_crf=False, dropout_prob=0.0,
+                 use_one_hot_embeddings=False):
+        super(BertNERModel, self).__init__()
+        self.bert = BertModel(config, is_training, use_one_hot_embeddings)
+        self.cast = P.Cast()
+        self.weight_init = TruncatedNormal(config.initializer_range)
+        self.log_softmax = P.LogSoftmax(axis=-1)
+        self.dtype = config.dtype
+        self.num_labels = num_labels
+        self.dense_1 = nn.Dense(config.hidden_size, self.num_labels, weight_init=self.weight_init,
+                                has_bias=True).to_float(config.compute_type)
+        # NOTE(review): presumably nn.Dropout takes the keep probability here -- confirm for the MindSpore version in use.
+        self.dropout = nn.Dropout(1 - dropout_prob)
+        self.reshape = P.Reshape()
+        # Target shape that flattens the sequence output to rows of hidden_size values.
+        self.shape = (-1, config.hidden_size)
+        self.use_crf = use_crf
+        self.origin_shape = (config.batch_size, config.seq_length, self.num_labels)
+
+    def construct(self, input_ids, input_mask, token_type_id):
+        # Note the argument order expected by BertModel: token_type_id before input_mask.
+        sequence_output, _, _ = \
+            self.bert(input_ids, token_type_id, input_mask)
+        seq = self.dropout(sequence_output)
+        seq = self.reshape(seq, self.shape)
+        logits = self.dense_1(seq)
+        logits = self.cast(logits, self.dtype)
+        if self.use_crf:
+            return_value = self.reshape(logits, self.origin_shape)
+        else:
+            return_value = self.log_softmax(logits)
+        return return_value
+
+class CrossEntropyCalculation(nn.Cell):
+    """
+    Cross Entropy loss.
+
+    In training mode, returns the mean over examples of the negative
+    log-probability selected by the one-hot label. Otherwise the input
+    `logits` are returned (multiplied by 1.0).
+    """
+    def __init__(self, is_training=True):
+        super(CrossEntropyCalculation, self).__init__()
+        self.onehot = P.OneHot()
+        self.on_value = Tensor(1.0, mstype.float32)
+        self.off_value = Tensor(0.0, mstype.float32)
+        self.reduce_sum = P.ReduceSum()
+        self.reduce_mean = P.ReduceMean()
+        self.reshape = P.Reshape()
+        self.last_idx = (-1,)
+        self.neg = P.Neg()
+        self.cast = P.Cast()
+        self.is_training = is_training
+
+    def construct(self, logits, label_ids, num_labels):
+        if self.is_training:
+            # Flatten the labels so that they line up with the rows of `logits`.
+            label_ids = self.reshape(label_ids, self.last_idx)
+            one_hot_labels = self.onehot(label_ids, num_labels, self.on_value, self.off_value)
+            per_example_loss = self.neg(self.reduce_sum(one_hot_labels * logits, self.last_idx))
+            loss = self.reduce_mean(per_example_loss, self.last_idx)
+            return_value = self.cast(loss, mstype.float32)
+        else:
+            return_value = logits * 1.0
+        return return_value
+
+class BertCLS(nn.Cell):
+    """
+    Train interface for classification finetuning task.
+
+    Chains BertCLSModel with CrossEntropyCalculation; construct returns the
+    value produced by the loss cell.
+    """
+    def __init__(self, config, is_training, num_labels=2, dropout_prob=0.0, use_one_hot_embeddings=False):
+        super(BertCLS, self).__init__()
+        self.bert = BertCLSModel(config, is_training, num_labels, dropout_prob, use_one_hot_embeddings)
+        self.loss = CrossEntropyCalculation(is_training)
+        self.num_labels = num_labels
+    def construct(self, input_ids, input_mask, token_type_id, label_ids):
+        log_probs = self.bert(input_ids, input_mask, token_type_id)
+        loss = self.loss(log_probs, label_ids, self.num_labels)
+        return loss
+
+
+class BertNER(nn.Cell):
+    """
+    Train interface for sequence labeling finetuning task.
+
+    Chains BertNERModel with either a CRF loss (use_crf=True, which requires
+    `tag_to_index`) or CrossEntropyCalculation.
+
+    Raises:
+        Exception: If use_crf is True and `tag_to_index` is empty or None.
+    """
+    def __init__(self, config, is_training, num_labels=11, use_crf=False, tag_to_index=None, dropout_prob=0.0,
+                 use_one_hot_embeddings=False):
+        super(BertNER, self).__init__()
+        self.bert = BertNERModel(config, is_training, num_labels, use_crf, dropout_prob, use_one_hot_embeddings)
+        if use_crf:
+            if not tag_to_index:
+                raise Exception("The dict for tag-index mapping should be provided for CRF.")
+            self.loss = CRF(tag_to_index, config.batch_size, config.seq_length, is_training)
+        else:
+            self.loss = CrossEntropyCalculation(is_training)
+        self.num_labels = num_labels
+        self.use_crf = use_crf
+    def construct(self, input_ids, input_mask, token_type_id, label_ids):
+        logits = self.bert(input_ids, input_mask, token_type_id)
+        # The CRF loss takes only (logits, labels); the cross-entropy loss also needs num_labels.
+        if self.use_crf:
+            loss = self.loss(logits, label_ids)
+        else:
+            loss = self.loss(logits, label_ids, self.num_labels)
+        return loss
diff --git a/project_1/1-Model_Optimization.ipynb b/project_1/1-Model_Optimization.ipynb
new file mode 100644
index 0000000..2fc0003
--- /dev/null
+++ b/project_1/1-Model_Optimization.ipynb
@@ -0,0 +1,565 @@
+{
+ "cells": [
+  {
"cell_type": "markdown", + "metadata": {}, + "source": [ + "

调优模型和训练策略

\n", + "\n", + "[TOC]\n", + "\n", + "## 作业介绍\n", + "\n", + "模型调优和训练策略调优是当前深度学习领域最常见、最难和最耗费精力的工作,旨在降低训练难度,提高模型精度,减少模型大小,降低模型推理时延。本作业要求在给定LeNet5模型+CIFAR10数据集的基础上,对模型和训练策略进行调优,以验证精度、模型大小和推理时延为目标,优先级为精度>大小>时延。\n", + "\n", + "要求模型在CIFAR10验证集上的精度不低于60%,最终成绩可参考`0.50*精度(%) - 0.35*大小(MB) - 0.15*时延(ms)`的方式评定。\n", + "\n", + "## 作业目的\n", + "\n", + "- 了解当前深度学习研发人员最常见的工作;\n", + "- 了解并熟悉如何使用MindSpore进行模型开发和调试;\n", + "- 了解模型调优的基本方向和常用策略,了解模型深度(层数)、模型宽度(核大小)、特殊结构(Bypass)等概念,及其对模型精度、大小和时延的影响;\n", + "- 了解训练策略调优的常用方法,了解Epoch数、Batch Size、优化器、学习率、正则化项等对模型训练和精度的影响。\n", + "\n", + "## 预备知识\n", + "\n", + "- 熟练使用Python,了解Shell及Linux操作系统基本知识。\n", + "- 具备一定的深度学习理论知识,如卷积神经网络、损失函数、优化器,训练策略、Checkpoint等。\n", + "- 了解华为云的基本使用方法,包括[OBS(对象存储)](https://www.huaweicloud.com/product/obs.html)、[ModelArts(AI开发平台)](https://www.huaweicloud.com/product/modelarts.html)、[Notebook(开发工具)](https://support.huaweicloud.com/engineers-modelarts/modelarts_23_0033.html)等功能。华为云官网:https://www.huaweicloud.com\n", + "- 了解并熟悉MindSpore AI计算框架,MindSpore官网:https://www.mindspore.cn/\n", + "\n", + "## 开发环境\n", + "\n", + "- MindSpore 0.1.0(MindSpore版本会定期更新,本指导也会定期刷新,与版本配套);\n", + "- 华为云ModelArts:ModelArts是华为云提供的面向开发者的一站式AI开发平台,集成了昇腾AI处理器资源池,用户可以在该平台下体验MindSpore。ModelArts官网:https://www.huaweicloud.com/product/modelarts.html\n", + "\n", + "## 开发准备\n", + "\n", + "### 创建OBS桶\n", + "\n", + "本实验需要使用华为云OBS存储脚本和数据集,可以参考[快速通过OBS控制台上传下载文件](https://support.huaweicloud.com/qs-obs/obs_qs_0001.html)了解使用OBS创建桶、上传文件、下载文件的使用方法。\n", + "\n", + "> **提示:**华为云新用户使用OBS时通常需要创建和配置“访问密钥”,可以在使用OBS时根据提示完成创建和配置。也可以参考[获取访问密钥并完成ModelArts全局配置](https://support.huaweicloud.com/prepare-modelarts/modelarts_08_0002.html)获取并配置访问密钥。\n", + "\n", + "创建OBS桶的参考配置如下:\n", + "\n", + "- 区域:华北-北京四\n", + "- 数据冗余存储策略:单AZ存储\n", + "- 桶名称:如ms-course\n", + "- 存储类别:标准存储\n", + "- 桶策略:公共读\n", + "- 归档数据直读:关闭\n", + "- 企业项目、标签等配置:免\n", + "\n", + "### 数据集准备\n", + "\n", + "CIFAR-10是一个图片分类数据集,包含60000张32x32的彩色物体图片,训练集50000张,测试集10000张,共10类,每类6000张。CIFAR-10数据集的官网:[THE CIFAR-10
DATABASE](http://www.cs.toronto.edu/~kriz/cifar.html)。\n", + "\n", + "从CIFAR-10官网下载“CIFAR-10 binary version (suitable for C programs)”到本地并解压。\n", + "\n", + "### 脚本准备\n", + "\n", + "从[课程gitee仓库](https://gitee.com/mindspore/course)上下载对应的Jupyter Notebook(内容同本指导)。\n", + "\n", + "### 上传文件\n", + "\n", + "将脚本和数据集上传到OBS桶中,组织为如下形式:\n", + "\n", + "```\n", + "project_1\n", + "├── xxx.ipynb\n", + "└── cifar10\n", + " ├── batches.meta.txt\n", + " ├── eval\n", + " │   └── test_batch.bin\n", + " └── train\n", + " ├── data_batch_1.bin\n", + " ├── data_batch_2.bin\n", + " ├── data_batch_3.bin\n", + " ├── data_batch_4.bin\n", + " └── data_batch_5.bin\n", + "```\n", + "\n", + "### 创建并打开Notebook\n", + "\n", + "可以参考[创建并打开Notebook](https://support.huaweicloud.com/engineers-modelarts/modelarts_23_0034.html)来创建并打开上传的Notebook脚本。\n", + "\n", + "创建Notebook的参考配置:\n", + "\n", + "- 计费模式:按需计费\n", + "- 名称:project_1\n", + "- 工作环境:Python3\n", + "- 资源池:公共资源\n", + "- 类型:Ascend\n", + "- 规格:单卡1*Ascend 910\n", + "- 存储位置:对象存储服务(OBS)->选择上述新建的OBS桶中的project_1文件夹\n", + "- 自动停止等配置:默认\n", + "\n", + "> **注意:**\n", + "> - 打开Notebook前,在Jupyter Notebook文件列表页面,勾选目录里的所有文件/文件夹(脚本和数据集),并点击列表上方的“Sync OBS”按钮,使OBS桶中的所有文件同时同步到Notebook工作环境中,这样Notebook中的代码才能访问数据集。参考[使用Sync OBS功能](https://support.huaweicloud.com/engineers-modelarts/modelarts_23_0038.html)。\n", + "> - 打开Notebook后,选择MindSpore环境作为Kernel。\n", + "\n", + "> **提示:**上述数据集和脚本的准备工作也可以在Notebook环境中完成,在Jupyter Notebook文件列表页面,点击右上角的\"New\"->\"Terminal\",进入Notebook环境所在终端,进入`work`目录,可以使用常用的linux shell命令,如`wget, gzip, tar, mkdir, mv`等,完成数据集和脚本的下载和准备。\n", + "\n", + "## 作业内容\n", + "\n", + "作业基于上述打开的Notebook进行,进行作业前请确保完成了上述准备工作。\n", + "\n", + "> **提示:**请从上至下阅读提示并执行代码框进行体验。代码框执行过程中左侧呈现[\\*],代码框执行完毕后左侧呈现如[1],[2]等。请等上一个代码框执行完毕后再执行下一个代码框。\n", + "\n", + "导入MindSpore模块和辅助模块:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "# os.environ['DEVICE_ID'] = '0'\n", + "import time\n", + "\n", + "import 
matplotlib.pyplot as plt\n", + "import numpy as np\n", + "\n", + "import mindspore as ms\n", + "import mindspore.context as context\n", + "import mindspore.dataset.transforms.c_transforms as C\n", + "import mindspore.dataset.transforms.vision.c_transforms as CV\n", + "\n", + "from mindspore.dataset.transforms.vision import Inter\n", + "from mindspore import nn, Tensor\n", + "from mindspore.train import Model\n", + "from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor\n", + "from mindspore.train.serialization import load_checkpoint, load_param_into_net\n", + "\n", + "import logging; logging.getLogger('matplotlib.font_manager').disabled = True\n", + "\n", + "context.set_context(mode=context.GRAPH_MODE, device_target='Ascend')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 数据处理\n", + "\n", + "对其中几张图片进行可视化,可以看到图片中的物体/动物,图片的大小为32x32。" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "DATA_DIR_TRAIN = \"/home/share/dataset/cifar-10-batches-bin/train\" # 训练集信息\n", + "DATA_DIR_TEST = \"/home/share/dataset/cifar-10-batches-bin/eval\" # 测试集信息\n", + "LABELS = \"/home/share/dataset/cifar-10-batches-bin/batches.meta.txt\" # 标签信息\n", + "\n", + "ds = ms.dataset.Cifar10Dataset(DATA_DIR_TRAIN)\n", + "ds = ds.create_dict_iterator()\n", + "with open(LABELS, \"r\") as f:\n", + " labels = [x.strip() for x in f.readlines()]\n", + "\n", + "for i in range(1, 10):\n", + " data = ds.get_next() \n", + " plt.subplot(3, 3, i)\n", + " plt.imshow(data['image'])\n", + " plt.title('%s' % labels[data['label']])\n", + " plt.xticks([])\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "在使用数据集训练网络前,首先需要对数据进行预处理,如下:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def create_dataset(training=True, num_epoch=1, batch_size=32, resize=(32, 32), rescale=1/255, shift=0, buffer_size=64):\n", + " ds = ms.dataset.Cifar10Dataset(DATA_DIR_TRAIN if training else DATA_DIR_TEST)\n", + " \n", + " # define map operations\n", + " if training:\n", + " random_crop_op = CV.RandomCrop((32,32), (4,4,4,4))\n", + " random_flip_op = CV.RandomHorizontalFlip()\n", + " ds = ds.map(input_columns=\"image\", operations=[random_crop_op, random_flip_op])\n", + " \n", + " resize_op = CV.Resize(resize) # Bilinear as default\n", + " rescale_op = CV.Rescale(rescale, shift)\n", + " normalize_op = CV.Normalize((0.4465, 0.4822, 0.4914), (0.2010, 0.1994, 0.2023))\n", + " changeswap_op = CV.HWC2CHW()\n", + " \n", + " # apply map operations on images\n", + " ds = ds.map(input_columns=\"image\", operations=[resize_op, rescale_op, normalize_op, changeswap_op])\n", + " ds = ds.map(input_columns=\"label\", operations=C.TypeCast(ms.int32))\n", + " \n", + " ds = ds.shuffle(buffer_size=buffer_size)\n", 
+ " ds = ds.batch(batch_size, drop_remainder=True)\n", + " ds = ds.repeat(num_epoch)\n", + " \n", + " return ds" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 定义模型\n", + "\n", + "预置模型为LeNet5:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "class MyNet(nn.Cell):\n", + " def __init__(self):\n", + " super(MyNet, self).__init__()\n", + " self.relu = nn.ReLU()\n", + " self.conv1 = nn.Conv2d(3, 6, 5, stride=1, pad_mode='valid')\n", + " self.conv2 = nn.Conv2d(6, 16, 5, stride=1, pad_mode='valid')\n", + " self.pool = nn.MaxPool2d(kernel_size=2, stride=2)\n", + " self.flatten = nn.Flatten()\n", + " self.fc1 = nn.Dense(400, 120)\n", + " self.fc2 = nn.Dense(120, 84)\n", + " self.fc3 = nn.Dense(84, 10)\n", + " \n", + " def construct(self, input_x):\n", + " output = self.conv1(input_x)\n", + " output = self.relu(output)\n", + " output = self.pool(output)\n", + " output = self.conv2(output)\n", + " output = self.relu(output)\n", + " output = self.pool(output)\n", + " output = self.flatten(output)\n", + " # print(output.shape()) # 仅Pynative模式时可用,Graph模式时请注释掉\n", + " output = self.fc1(output)\n", + " output = self.fc2(output)\n", + " output = self.fc3(output)\n", + " \n", + " return output" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "使用Pynative模式对网络进行调试,比如打印网络中某一层的输出shape:`print(output.shape())`。" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "context.set_context(mode=context.PYNATIVE_MODE)\n", + "x = Tensor(np.ones([1, 3, 32, 32]), ms.float32)\n", + "y = MyNet()(x)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> **注意:**调试完毕后,需注释掉网络定义中`construct`里的打印语句:`print(output.shape())`,并将切换为Graph模式进行模型训练。" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "context.set_context(mode=context.GRAPH_MODE)" + ] + }, 
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 模型训练\n", + "\n", + "一般情况下,模型训练时采用静态学习率,如0.01。随着训练步数的增加,模型逐渐趋于收敛,对权重参数的更新幅度应该逐渐降低,以减小模型训练后期的抖动。所以,模型训练时可以采用动态下降的学习率,常见的学习率下降策略有:\n", + "\n", + "- polynomial decay/square decay;\n", + "- cosine decay;\n", + "- exponential decay;\n", + "- stage decay." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def build_lr(total_steps, decay_type='cosine', lr_base=0.1, lr_init=0.0, warmup_steps=0):\n", + " \"\"\"\n", + " Generate learning rate array.\n", + "\n", + " Args:\n", + " total_steps (int): Total steps to decay over.\n", + " decay_type (str): Decay strategy, 'cosine' or 'square'. Default: 'cosine'.\n", + " lr_base (float): Base learning rate. Default: 0.1.\n", + " lr_init (float): Initial learning rate for warmup. Default: 0.0.\n", + " warmup_steps (int): The number of warming up steps. Default: 0.\n", + "\n", + " Returns:\n", + " np.array, learning rate array.\n", + " \"\"\"\n", + " lr_base, lr_init = float(lr_base), float(lr_init)\n", + " lr_per_step = []\n", + " if warmup_steps != 0:\n", + " inc_per_step = (lr_base - lr_init) / warmup_steps\n", + " else:\n", + " inc_per_step = 0.0\n", + " for i in range(int(total_steps)):\n", + " if i < warmup_steps:\n", + " lr = lr_init + inc_per_step * i\n", + " else:\n", + " if decay_type == 'square':\n", + " frac = 1.0 - float(i - warmup_steps) / (total_steps - warmup_steps)\n", + " lr = lr_base * (frac * frac)\n", + " elif decay_type == 'exponential':\n", + " pass # 尝试实现\n", + " elif decay_type == 'cosine':\n", + " lr = 0.5 * lr_base * (1 + np.cos(np.pi * i / total_steps))\n", + " else:\n", + " raise\n", + " lr_per_step.append(lr)\n", + " \n", + " lr_per_step = np.array(lr_per_step).astype(np.float32)\n", + " return lr_per_step" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "观察不同学习率下降策略的曲线:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { +
"image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "steps = 3*1562\n", + "plt.plot(range(steps), [0.1]*steps)\n", + "plt.plot(range(steps), build_lr(steps, decay_type='square', lr_base=0.1))\n", + "plt.plot(range(steps), build_lr(steps, decay_type='cosine', lr_base=0.1))\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "采用一定的训练策略对模型进行训练,并观察最终的验证精度。这里采用Momentum优化器 + cosine decay学习率下降策略。\n", + "\n", + "- cosine0.1_epoch100:{'acc': 0.5056089743589743, 'loss': 1.3536554261659965}\n", + "- square0.1_epoch100:{'acc': 0.5385616987179487, 'loss': 1.2873077663855674}\n", + "- const0.01_epoch100:{'acc': 0.5464743589743589, 'loss': 1.3035117800419147}\n", + "\n", + "- cosine0.1_epoch50:{'acc': 0.43900240384615385, 'loss': 1.5275637297294078}\n", + "- square0.1_epoch50:{'acc': 0.5453725961538461, 'loss': 1.2635320337154927}\n", + "- const0.01_epoch50:{'acc': 0.546073717948718, 'loss': 1.2912015158396501}\n", + "\n", + "- cosine0.01_epoch50:{'acc': 0.6571514423076923, 'loss': 0.9970117075703083}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.system('rm -f *.ckpt *.ir *.meta') # 清理旧的运行文件\n", + "LOOP_SINK = context.get_context('enable_loop_sink')\n", + "\n", + "def test_train(num_epoch=2, momentum=0.9, lr=0.01, decay_type='square', check_point_name=\"mynet\"):\n", + " ds_train = create_dataset(num_epoch=num_epoch)\n", + " ds_eval = create_dataset(training=False)\n", + " steps_per_epoch = ds_train.get_dataset_size()\n", + " \n", + " net = MyNet()\n", + " loss = nn.loss.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction='mean')\n", + " if decay_type:\n", + " lr = build_lr(num_epoch*steps_per_epoch, decay_type=decay_type, lr_base=lr)\n", + " opt = nn.Momentum(net.trainable_params(), lr, momentum, weight_decay=0.0)\n", + " \n", + " ckpt_cfg = 
CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=5)\n", + " ckpt_cb = ModelCheckpoint(prefix=check_point_name, config=ckpt_cfg)\n", + " loss_cb = LossMonitor(per_print_times=1 if LOOP_SINK else steps_per_epoch)\n", + " \n", + " model = Model(net, loss, opt, metrics={'acc', 'loss'})\n", + " model.train(num_epoch, ds_train, callbacks=[ckpt_cb, loss_cb], dataset_sink_mode=True)\n", + " metrics = model.eval(ds_eval)\n", + " print('Metrics:', metrics)\n", + "\n", + "test_train(num_epoch=50, lr=0.01, decay_type='cosine')\n", + "print('\\n'.join(sorted([x for x in os.listdir('.') if x.startswith('mynet')])))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 模型大小\n", + "\n", + "统计模型参数量,包括所有可训练的权重、偏置的参数。" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('conv1.weight', 450), ('conv2.weight', 2400), ('fc1.weight', 48000), ('fc1.bias', 120), ('fc2.weight', 10080), ('fc2.bias', 84), ('fc3.weight', 840), ('fc3.bias', 10)]\n", + "Num params(M): 0.061984\n" + ] + } + ], + "source": [ + "params = MyNet().trainable_params()\n", + "print([(p.name, np.prod(p.data.shape())) for p in params])\n", + "num_params = sum([np.prod(p.data.shape()) for p in params])\n", + "print('Num params(M):', num_params/1e6)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 推理时延\n", + "\n", + "模型第一次执行推理时需要编译计算图和算子,通常时延较长,通常需要先进行预热(warmup),然后再循环推理多次,取多次推理时延的平均值作为模型的推理时延。" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Latency(ms): 1.290428638458252\n" + ] + } + ], + "source": [ + "x = Tensor(np.ones([1, 3, 32, 32]), ms.float32)\n", + "net = MyNet()\n", + "# 预热\n", + "for i in range(5):\n", + " y = net(x)\n", + "# 多次推理取平均值\n", + "start = time.time()\n", + "for i in range(100):\n", + " y = net(x)\n", + 
"end = time.time()\n", + "print('Latency(ms):', (end-start)/100 * 1000)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 作业结论\n", + "\n", + "预置训练策略,以及预置模型的精度、大小和时延如下:\n", + "\n", + "| batch size | number of epochs | learning rate | decay type | optimizer | number of parameters(M) | latency(ms) | acc(%) |\n", + "| -- | -- | -- | -- | -- | -- | -- | -- |\n", + "| 32 | 50 | 0.01 | cosine | Momentum 0.9 | 0.061984 | 1.290 | 65.7 |\n", + "\n", + "在预置模型和训练策略的基础上,请:\n", + "\n", + "- 尝试调整模型深度(层数)、模型宽度(核大小)、模型结构(Conv, MaxPool, AvgPool, FC, Bypass)等,并评估其对模型精度、大小和时延的影响;\n", + "- 尝试调整Epoch数、Batch Size、优化器、学习率、正则化项等,并评估其对模型训练和精度的影响。\n", + "\n", + "调优模型和训练策略的结果(请填写):\n", + "\n", + "| batch size | number of epochs | learning rate | decay type | optimizer | number of parameters(M) | latency(ms) | acc(%) |\n", + "| -- | -- | -- | -- | -- | -- | -- | -- |\n", + "| x | x | x | x | x | x | x | x |\n", + "\n", + "### 模型调优总结\n", + "\n", + "请填写\n", + "\n", + "### 训练策略调优总结\n", + "\n", + "请填写\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} -- GitLab