From 001222f10481c54f2faf04d0085c6c27da2a8224 Mon Sep 17 00:00:00 2001
From: Aston Zhang
Date: Tue, 6 Nov 2018 20:04:23 +0000
Subject: [PATCH] rm op

---
 chapter_computer-vision/ssd.md | 111 ---------------------------------
 1 file changed, 111 deletions(-)

diff --git a/chapter_computer-vision/ssd.md b/chapter_computer-vision/ssd.md
index cf5e6d5..d197baa 100644
--- a/chapter_computer-vision/ssd.md
+++ b/chapter_computer-vision/ssd.md
@@ -58,19 +58,6 @@ y2 = forward(nd.zeros((2, 16, 10, 10)), cls_predictor(3, 10))
 (y1.shape, y2.shape)
 ```

-```{.json .output n=3}
-[
-  {
-    "data": {
-      "text/plain": "((2, 55, 20, 20), (2, 33, 10, 10))"
-    },
-    "execution_count": 3,
-    "metadata": {},
-    "output_type": "execute_result"
-  }
-]
-```
-
 The prediction output format is (batch size, number of channels, height, width). Apart from the batch size, the sizes of the other dimensions all differ across scales. We need to reshape the outputs into a consistent format and concatenate the multiscale results to simplify subsequent processing.

 We first move the channels, i.e., the prediction results, to the last dimension. Since the batch size stays the same across scales, we convert each result into the two-dimensional format (batch size, height $\times$ width $\times$ number of channels) to make the later concatenation convenient.
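The transpose-and-flatten step described in this context is easy to check by hand: flattening $(2, 55, 20, 20)$ gives $2 \times 22000$, flattening $(2, 33, 10, 10)$ gives $2 \times 3300$, and concatenating yields $(2, 25300)$, which is exactly the output removed in the next hunk. A minimal NumPy sketch of that logic (`concat_preds` appears in the hunk below; `flatten_pred` is a hypothetical helper name, not necessarily the file's exact implementation):

```python
import numpy as np

def flatten_pred(pred):
    # (batch, channels, h, w) -> (batch, h * w * channels):
    # channels move last, then everything but the batch axis is flattened.
    return pred.transpose(0, 2, 3, 1).reshape(pred.shape[0], -1)

def concat_preds(preds):
    # Concatenate the per-scale flattened predictions along axis 1.
    return np.concatenate([flatten_pred(p) for p in preds], axis=1)

y1 = np.zeros((2, 55, 20, 20))  # 5 anchors x (10 + 1) classes = 55 channels
y2 = np.zeros((2, 33, 10, 10))  # 3 anchors x (10 + 1) classes = 33 channels
print(concat_preds([y1, y2]).shape)  # (2, 25300): 20*20*55 + 10*10*33
```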
@@ -93,19 +80,6 @@ def concat_preds(preds):
 concat_preds([y1, y2]).shape
 ```

-```{.json .output n=6}
-[
-  {
-    "data": {
-      "text/plain": "(2, 25300)"
-    },
-    "execution_count": 6,
-    "metadata": {},
-    "output_type": "execute_result"
-  }
-]
-```
-
 ### Downsampling Block

 The downsampling block halves the input height and width to produce features at a different scale; this is done with a $2\times2$ max pooling layer of stride 2. As mentioned earlier, because the prediction layer uses a window of 3, we need extra convolutional layers to enlarge its effective window so that it covers the anchor box region well. To this end we add two $3\times3$ convolutional layers, each followed by a batch normalization layer and a ReLU activation. This way, a $3\times3$ window at one scale covers a $10\times10$ window at the previous scale.
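Only fragments of `down_sample_blk` survive in the hunk context below. Based on the description just given (two $3\times3$ convolutions with batch norm and ReLU, then a stride-2 $2\times2$ max pool), such a block might look as follows in Gluon; this is a sketch under that assumption, not necessarily the file's exact code:

```python
from mxnet import nd
from mxnet.gluon import nn

def down_sample_blk(num_channels):
    """Two 3x3 conv layers (each with BN + ReLU), then a stride-2 2x2 max pool."""
    blk = nn.Sequential()
    for _ in range(2):
        blk.add(nn.Conv2D(num_channels, kernel_size=3, padding=1),
                nn.BatchNorm(in_channels=num_channels),
                nn.Activation('relu'))
    blk.add(nn.MaxPool2D(2))  # halves height and width
    return blk

blk = down_sample_blk(10)
blk.initialize()
# (2, 3, 20, 20) -> (2, 10, 10, 10): channels set to 10, height/width halved
print(blk(nd.zeros((2, 3, 20, 20))).shape)
```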
" - }, - "metadata": {}, - "output_type": "display_data" - } -] -``` - ## 小结 * SSD在多尺度上对每个锚框同时预测类别以及与真实边界框的位移来进行目标检测。 @@ -429,19 +344,6 @@ for l, s in zip(lines, sigmas): gb.plt.legend(); ``` -```{.json .output n=23} -[ - { - "data": { - "image/svg+xml": "\n\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n", - "text/plain": "
" - }, - "metadata": {}, - "output_type": "display_data" - } -] -``` - 对于类别预测我们使用了交叉熵损失。假设对真实类别$j$的概率预测是$p_j$,交叉熵损失为$\log(p_j)$。我们可以使用一个被称为关注损失(focal loss)的函数来对之稍微变形。给定正的$\gamma$和$\alpha$,它的定义是 $$ - \alpha (1-p_j)^{\gamma} \log(p_j) $$ @@ -459,19 +361,6 @@ for l, gamma in zip(lines, [0, 1, 5]): gb.plt.legend(); ``` -```{.json .output n=24} -[ - { - "data": { - "image/svg+xml": "\n\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n", - "text/plain": "
" - }, - "metadata": {}, - "output_type": "display_data" - } -] -``` - ### 训练和预测 * 当目标在图像中占比很小时,我们通常会使用比较大的输入图像尺寸。 -- GitLab