From 3b2073d2c096f82ade363595c2f86eac75f16ed4 Mon Sep 17 00:00:00 2001 From: Aston Zhang Date: Wed, 19 Dec 2018 00:55:27 +0000 Subject: [PATCH] Revert "rm ssd output" This reverts commit 1d5581518880403d4d2975903f1955b3582c9593. --- chapter_computer-vision/ssd.md | 111 +++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) diff --git a/chapter_computer-vision/ssd.md b/chapter_computer-vision/ssd.md index cade457..7d1b7a5 100644 --- a/chapter_computer-vision/ssd.md +++ b/chapter_computer-vision/ssd.md @@ -57,6 +57,19 @@ Y2 = forward(nd.zeros((2, 16, 10, 10)), cls_predictor(3, 10)) (Y1.shape, Y2.shape) ``` +```{.json .output n=3} +[ + { + "data": { + "text/plain": "((2, 55, 20, 20), (2, 33, 10, 10))" + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } +] +``` + 通道维包含中心相同的锚框的预测结果。我们首先将通道维移到最后一维。因为不同尺度下批量大小仍保持不变,我们可以将预测结果转成二维的(批量大小,高$\times$宽$\times$通道数)的格式,以方便之后在维度1上的连结。 ```{.python .input n=4} @@ -73,6 +86,19 @@ def concat_preds(preds): concat_preds([Y1, Y2]).shape ``` +```{.json .output n=5} +[ + { + "data": { + "text/plain": "(2, 25300)" + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } +] +``` + ### 高和宽减半块 为了在多尺度检测目标,下面定义高和宽减半块`down_sample_blk`。它串联了两个填充为1的$3\times3$卷积层和步幅为2的$2\times2$最大池化层。我们知道,填充为1的$3\times3$卷积层不改变特征图的形状,而后面的池化层直接将特征图的高和宽减半。由于$1\times 2+(3-1)+(3-1)=6$,输出特征图中每个单元在输入特征图上的感受野形状为$6\times6$。可以看出,高和宽减半块使得输出特征图中每个单元的感受野变得更广阔。 @@ -94,6 +120,19 @@ def down_sample_blk(num_channels): forward(nd.zeros((2, 3, 20, 20)), down_sample_blk(10)).shape ``` +```{.json .output n=7} +[ + { + "data": { + "text/plain": "(2, 10, 10, 10)" + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } +] +``` + ### 基础网络块 基础网络块用来从原始图像抽取特征。为了计算简洁,我们在这里构造一个小的基础网络。该网络串联三个高和宽减半块,并逐步将通道数翻倍。当输入的原始图像的形状为$256\times256$时,基础网络块输出的特征图的形状为$32 \times 32$。 @@ -108,6 +147,19 @@ def base_net(): forward(nd.zeros((2, 3, 256, 256)), base_net()).shape ``` +```{.json .output n=8} +[ + { + "data": { + "text/plain": "(2, 64, 32, 32)" + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } +] +``` + ### 完整的模型 SSD模型一共包含五个模块,每个模块输出的特征图既用来生成锚框,又用来预测这些锚框的类别和偏移量。第一模块为基础网络块,第二至第四模块为高和宽减半块,第五模块使用全局最大池化层将高和宽降到1。因此第二至第五模块均为图9.4中的多尺度特征块。 @@ -183,6 +235,16 @@ print('output class preds:', cls_preds.shape) print('output bbox preds:', bbox_preds.shape) ``` +```{.json .output n=13} +[ + { + "name": "stdout", + "output_type": "stream", + "text": "output anchors: (1, 5444, 4)\noutput class preds: (32, 5444, 2)\noutput bbox preds: (32, 21776)\n" + } +] +``` + ## 训练 下面我们描述如何一步步训练SSD模型来进行目标检测。 @@ -260,6 +322,16 @@ for epoch in range(20): epoch + 1, 1 - acc / (i + 1), mae / (i + 1), time.time() - start)) ``` +```{.json .output n=18} +[ + { + "name": "stdout", + "output_type": "stream", + "text": "epoch 5, class err 3.02e-03, bbox mae 3.31e-03, time 8.9 sec\nepoch 10, class err 2.69e-03, bbox mae 2.90e-03, time 8.8 sec\nepoch 15, class err 2.68e-03, bbox mae 2.85e-03, time 8.9 sec\nepoch 20, class err 2.62e-03, bbox mae 2.65e-03, time 8.8 sec\n" + } +] +``` + ## 预测 在预测阶段,我们希望能把图像里面所有感兴趣的目标检测出来。下面读取测试图像,将其变换尺寸,然后转成卷积层需要的四维格式。 @@ -301,6 +373,19 @@ def display(img, output, threshold): display(img, output, threshold=0.3) ``` +```{.json .output n=21} +[ + { + "data": { + "image/svg+xml": "\n\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n", + "text/plain": "
" + }, + "metadata": {}, + "output_type": "display_data" + } +] +``` + ## 小结 * SSD是一个多尺度的目标检测模型。该模型基于基础网络块和各个多尺度特征块生成不同数量和不同大小的锚框,并通过预测锚框的类别和偏移量检测不同大小的目标。 @@ -339,6 +424,19 @@ for l, s in zip(lines, sigmas): gb.plt.legend(); ``` +```{.json .output n=22} +[ + { + "data": { + "image/svg+xml": "\n\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n", + "text/plain": "
" + }, + "metadata": {}, + "output_type": "display_data" + } +] +``` + 在类别预测时,实验中使用了交叉熵损失:设真实类别$j$的预测概率是$p_j$,交叉熵损失为$-\log p_j$。我们还可以使用焦点损失(focal loss)[2]:给定正的超参数$\gamma$和$\alpha$,该损失的定义为 $$ - \alpha (1-p_j)^{\gamma} \log p_j.$$ @@ -356,6 +454,19 @@ for l, gamma in zip(lines, [0, 1, 5]): gb.plt.legend(); ``` +```{.json .output n=23} +[ + { + "data": { + "image/svg+xml": "\n\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n", + "text/plain": "
" + }, + "metadata": {}, + "output_type": "display_data" + } +] +``` + ### 训练和预测 * 当目标在图像中占比较小时,模型通常会采用比较大的输入图像尺寸。 -- GitLab