diff --git a/README.md b/README.md
index bd505ded45889a6c3298fc4fb85f580e1ea0fc9e..cd6f5d3400709f1a607521a1a817849113b2deda 100644
--- a/README.md
+++ b/README.md
@@ -84,7 +84,8 @@ Dive into Deep Learning with PyTorch.
 ### 9. Computer Vision
 [9.1 Image Augmentation](https://github.com/ShusenTang/Dive-into-DL-PyTorch/blob/master/docs/chapter09_computer-vision/9.1_image-augmentation.md)
-[9.2 Fine-Tuning](https://github.com/ShusenTang/Dive-into-DL-PyTorch/blob/master/docs/chapter09_computer-vision/9.2_fine-tuning.md)
+[9.2 Fine-Tuning](https://github.com/ShusenTang/Dive-into-DL-PyTorch/blob/master/docs/chapter09_computer-vision/9.2_fine-tuning.md)
+[9.3 Object Detection and Bounding Boxes](https://github.com/ShusenTang/Dive-into-DL-PyTorch/blob/master/docs/chapter09_computer-vision/9.3_bounding-box.md)
 Continuously updating......
diff --git a/code/chapter09_computer-vision/9.3_bounding-box.ipynb b/code/chapter09_computer-vision/9.3_bounding-box.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..599ebb086924c6c7c7a02430093fac4b23093816
--- /dev/null
+++ b/code/chapter09_computer-vision/9.3_bounding-box.ipynb
@@ -0,0 +1,944 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 9.3 Object Detection and Bounding Boxes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "%matplotlib inline\n",
+    "from PIL import Image\n",
+    "\n",
+    "import sys\n",
+    "sys.path.append(\"..\") \n",
+    "import d2lzh_pytorch as d2l"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/svg+xml": [
+       "[SVG output omitted: figure showing the catdog.jpg example image]"
+      ],
+      "text/plain": [
+       ""
+      ]
+     },
"metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "d2l.set_figsize()\n", + "img = Image.open('../../img/catdog.jpg')\n", + "d2l.plt.imshow(img); # 加分号只显示图" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 9.3.1 边界框" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# bbox是bounding box的缩写\n", + "dog_bbox, cat_bbox = [60, 45, 378, 516], [400, 112, 655, 493]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def bbox_to_rect(bbox, color): # 本函数已保存在d2lzh_pytorch中方便以后使用\n", + " # 将边界框(左上x, 左上y, 右下x, 右下y)格式转换成matplotlib格式:\n", + " # ((左上x, 左上y), 宽, 高)\n", + " return d2l.plt.Rectangle(\n", + " xy=(bbox[0], bbox[1]), width=bbox[2]-bbox[0], height=bbox[3]-bbox[1],\n", + " fill=False, edgecolor=color, linewidth=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig = d2l.plt.imshow(img)\n", + "fig.axes.add_patch(bbox_to_rect(dog_bbox, 'blue'))\n", + "fig.axes.add_patch(bbox_to_rect(cat_bbox, 'red'));" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:anaconda3]", + "language": 
"python", + "name": "conda-env-anaconda3-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/code/d2lzh_pytorch/utils.py b/code/d2lzh_pytorch/utils.py index 27876f5ee24205df57925357a75bf0947c432135..f4d1bbf74b49fcddad9422a9480f20ce81ab12f4 100644 --- a/code/d2lzh_pytorch/utils.py +++ b/code/d2lzh_pytorch/utils.py @@ -690,7 +690,6 @@ def show_images(imgs, num_rows, num_cols, scale=2): axes[i][j].axes.get_yaxis().set_visible(False) return axes -# 本函数已保存在d2lzh_pytorch包中方便以后使用 def train(train_iter, test_iter, net, loss, optimizer, device, num_epochs): net = net.to(device) print("training on ", device) @@ -712,3 +711,15 @@ def train(train_iter, test_iter, net, loss, optimizer, device, num_epochs): test_acc = evaluate_accuracy(test_iter, net) print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec' % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start)) + + + + + +############################## 9.3 ##################### +def bbox_to_rect(bbox, color): + # 将边界框(左上x, 左上y, 右下x, 右下y)格式转换成matplotlib格式: + # ((左上x, 左上y), 宽, 高) + return d2l.plt.Rectangle( + xy=(bbox[0], bbox[1]), width=bbox[2]-bbox[0], height=bbox[3]-bbox[1], + fill=False, edgecolor=color, linewidth=2) \ No newline at end of file diff --git a/docs/chapter09_computer-vision/9.3_bounding-box.md b/docs/chapter09_computer-vision/9.3_bounding-box.md new file mode 100644 index 0000000000000000000000000000000000000000..8511466eed0aa5272199d217a957f4e36eb030f3 --- /dev/null +++ b/docs/chapter09_computer-vision/9.3_bounding-box.md @@ -0,0 +1,67 @@ +# 9.3 目标检测和边界框 + +在前面的一些章节中,我们介绍了诸多用于图像分类的模型。在图像分类任务里,我们假设图像里只有一个主体目标,并关注如何识别该目标的类别。然而,很多时候图像里有多个我们感兴趣的目标,我们不仅想知道它们的类别,还想得到它们在图像中的具体位置。在计算机视觉里,我们将这类任务称为目标检测(object detection)或物体检测。 + +目标检测在多个领域中被广泛使用。例如,在无人驾驶里,我们需要通过识别拍摄到的视频图像里的车辆、行人、道路和障碍的位置来规划行进线路。机器人也常通过该任务来检测感兴趣的目标。安防领域则需要检测异常目标,如歹徒或者炸弹。 + +在接下来的几节里,我们将介绍目标检测里的多个深度学习模型。在此之前,让我们来了解目标位置这个概念。先导入实验所需的包或模块。 + +``` python +%matplotlib inline +from PIL import Image + +import sys +sys.path.append("..") +import d2lzh_pytorch as d2l +``` + +下面加载本节将使用的示例图像。可以看到图像左边是一只狗,右边是一只猫。它们是这张图像里的两个主要目标。 + +``` python +d2l.set_figsize() +img = Image.open('../../img/catdog.jpg') +d2l.plt.imshow(img); # 加分号只显示图 +``` +
+
+<div align=center>
+<img src="../../img/chapter09/9.3_output1.png"/>
+</div>
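+
+Before specifying any box coordinates, it can help to confirm the pixel dimensions of the loaded image, since the bounding boxes in the next subsection are given in this pixel coordinate system. This quick check is not part of the original notebook; it is a minimal sketch that relies only on standard PIL `Image` attributes:
+
+``` python
+# `img` is the PIL image loaded above.
+# PIL images expose their pixel dimensions as a (width, height) tuple.
+print(img.size)
+print(img.mode)  # color mode, e.g. 'RGB'
+```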
+
+## 9.3.1 Bounding Boxes
+
+In object detection we usually use a bounding box to describe the location of a target. A bounding box is a rectangle determined by the $x$ and $y$ coordinates of its top-left corner together with the $x$ and $y$ coordinates of its bottom-right corner. We define bounding boxes for the dog and the cat based on the coordinate information in the image above. The origin of the coordinates is the top-left corner of the image, and rightward and downward are the positive directions of the $x$ axis and the $y$ axis, respectively.
+
+``` python
+# bbox is short for bounding box
+dog_bbox, cat_bbox = [60, 45, 378, 516], [400, 112, 655, 493]
+```
+
+We can draw the bounding boxes on the image to check whether they are accurate. Before drawing, we define a helper function `bbox_to_rect`, which converts a bounding box into matplotlib's rectangle format.
+
+``` python
+def bbox_to_rect(bbox, color):  # this function is saved in d2lzh_pytorch for later use
+    # Convert a bounding box in (top-left x, top-left y, bottom-right x, bottom-right y)
+    # format to the matplotlib format: ((top-left x, top-left y), width, height)
+    return d2l.plt.Rectangle(
+        xy=(bbox[0], bbox[1]), width=bbox[2]-bbox[0], height=bbox[3]-bbox[1],
+        fill=False, edgecolor=color, linewidth=2)
+```
+
+After adding the bounding boxes to the image, we can see that the main outline of each target lies mostly inside its box.
+
+``` python
+fig = d2l.plt.imshow(img)
+fig.axes.add_patch(bbox_to_rect(dog_bbox, 'blue'))
+fig.axes.add_patch(bbox_to_rect(cat_bbox, 'red'));
+```
+Output:
+
+<div align=center>
+<img src="../../img/chapter09/9.3_output2.png"/>
+</div>
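+
+For readers who are not using the `d2lzh_pytorch` helper package, the same figure can be reproduced with plain matplotlib and PIL. The following is a minimal, self-contained sketch of what the calls above expand to, assuming (as in this repo) that `d2l.plt` is simply `matplotlib.pyplot` and that the image path is the one used throughout this section:
+
+``` python
+from PIL import Image
+from matplotlib import pyplot as plt
+
+img = Image.open('../../img/catdog.jpg')  # same example image as above
+dog_bbox, cat_bbox = [60, 45, 378, 516], [400, 112, 655, 493]
+
+fig = plt.imshow(img)
+for bbox, color in [(dog_bbox, 'blue'), (cat_bbox, 'red')]:
+    # plt.Rectangle takes the top-left corner plus width and height, so the
+    # (x1, y1, x2, y2) corner format is converted inline here.
+    fig.axes.add_patch(plt.Rectangle(
+        xy=(bbox[0], bbox[1]),
+        width=bbox[2] - bbox[0], height=bbox[3] - bbox[1],
+        fill=False, edgecolor=color, linewidth=2))
+plt.show()
+```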
+
+## Summary
+
+* In object detection we need to find not only all the objects of interest in an image but also their positions. A position is usually represented by a rectangular bounding box.
+
+
+-----------
+> Note: Apart from the code, this section is essentially the same as the original book. [Link to the original section](https://zh.d2l.ai/chapter_computer-vision/bounding-box.html)
diff --git a/img/catdog.jpg b/img/catdog.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..4471a714dc01726531128002517170a0140d4de5
Binary files /dev/null and b/img/catdog.jpg differ
diff --git a/img/chapter09/9.3_output1.png b/img/chapter09/9.3_output1.png
new file mode 100644
index 0000000000000000000000000000000000000000..9a57d44158bafbbd7c5def1bd51be6f7a5cc846e
Binary files /dev/null and b/img/chapter09/9.3_output1.png differ
diff --git a/img/chapter09/9.3_output2.png b/img/chapter09/9.3_output2.png
new file mode 100644
index 0000000000000000000000000000000000000000..9d361b669f356cfc33ee77ca0fa99b16b00ec724
Binary files /dev/null and b/img/chapter09/9.3_output2.png differ