Merge branch 'master' of https://github.com/ShusenTang/Dive-into-DL-PyTorch

merge from ubuntu

Merge branch 'master' of https://github.com/ShusenTang/Dive-into-DL-PyTorch
merge from ubuntu
43ba8dc3 · shusentang · d99e947b · 174ffd96 · 43ba8dc3 · 43ba8dc3
5 changed file
--- a/README.md
+++ b/README.md
@@ -80,6 +80,7 @@ Dive into Deep Learning with PyTorch.
 [8.1 命令式和符号式混合编程](https://github.com/ShusenTang/Dive-into-DL-PyTorch/blob/master/docs/chapter08_computational-performance/8.1_hybridize.md)      
 [8.2 异步计算](https://github.com/ShusenTang/Dive-into-DL-PyTorch/blob/master/docs/chapter08_computational-performance/8.2_async-computation.md)         
 [8.3 自动并行计算](https://github.com/ShusenTang/Dive-into-DL-PyTorch/blob/master/docs/chapter08_computational-performance/8.3_auto-parallelism.md)    
+[8.4 多GPU计算](https://github.com/ShusenTang/Dive-into-DL-PyTorch/blob/master/docs/chapter08_computational-performance/8.4_multiple-gpus.md)      


 持续更新中......

--- a/code/d2lzh_pytorch/utils.py
+++ b/code/d2lzh_pytorch/utils.py
@@ -204,8 +204,8 @@ def corr2d(X, K):


 # ############################ 5.5 #########################
-def evaluate_accuracy(data_iter, net):
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+def evaluate_accuracy(data_iter, net, 
+        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in data_iter:
@@ -222,7 +222,7 @@ def evaluate_accuracy(data_iter, net):
            n += y.shape[0]
    return acc_sum / n

-
+# TODO: 最后train_l_sum不应该除以n, 因为l已经在一个batch算过平均了, 所以最后train_l_sum应该除以batch数
 def train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs):
    net = net.to(device)
    print("training on ", device)
@@ -648,3 +648,18 @@ class Benchmark():

    def __exit__(self, *args):
        print('%stime: %.4f sec' % (self.prefix, time.time() - self.start))
+
+
+
+
+
+# ########################### 9.1 ########################################
+def show_images(imgs, num_rows, num_cols, scale=2):
+    figsize = (num_cols * scale, num_rows * scale)
+    _, axes = d2l.plt.subplots(num_rows, num_cols, figsize=figsize)
+    for i in range(num_rows):
+        for j in range(num_cols):
+            axes[i][j].imshow(imgs[i * num_cols + j])
+            axes[i][j].axes.get_xaxis().set_visible(False)
+            axes[i][j].axes.get_yaxis().set_visible(False)
+    return axes
\ No newline at end of file
--- a/docs/chapter05_CNN/5.5_lenet.md
+++ b/docs/chapter05_CNN/5.5_lenet.md
@@ -101,9 +101,9 @@ train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)
 因为卷积神经网络计算比多层感知机要复杂，建议使用GPU来加速计算。因此，我们对3.6节（softmax回归的从零开始实现）中描述的`evaluate_accuracy`函数略作修改，使其支持GPU计算。

 ``` python
-# 本函数已保存在d2lzh_pytorch包中方便以后使用。该函数将被逐步改进：它的完整实现将在“图像增广”一节中描述
-def evaluate_accuracy(data_iter, net):
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+# 本函数已保存在d2lzh_pytorch包中方便以后使用。该函数将被逐步改进。
+def evaluate_accuracy(data_iter, net, 
+        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in data_iter:
@@ -124,6 +124,7 @@ def evaluate_accuracy(data_iter, net):
 我们同样对3.6节中定义的`train_ch3`函数略作修改，确保计算使用的数据和模型同在内存或显存上。

 ``` python
+# TODO: 最后train_l_sum不应该除以n, 因为l已经在一个batch算过平均了, 所以最后train_l_sum应该除以batch数
 # 本函数已保存在d2lzh_pytorch包中方便以后使用
 def train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs):
    net = net.to(device)

--- a/docs/chapter05_CNN/5.7_vgg.md
+++ b/docs/chapter05_CNN/5.7_vgg.md
@@ -28,7 +28,7 @@ def vgg_block(num_convs, in_channels, out_channels):
        else:
            blk.append(nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1))
        blk.append(nn.ReLU())
-    blk.append(nn.MaxPool2d(kernel_size=2, stride=2))
+    blk.append(nn.MaxPool2d(kernel_size=2, stride=2)) # 这里会使宽高减半
    return nn.Sequential(*blk)
 ```

@@ -40,7 +40,8 @@ def vgg_block(num_convs, in_channels, out_channels):

 ``` python
 conv_arch = ((1, 1, 64), (1, 64, 128), (2, 128, 256), (2, 256, 512), (2, 512, 512))
-fc_features = 512 * 7 * 7 # 根据卷积层的输出算出来的
+# 经过5个vgg_block, 宽高会减半5次, 变成 224/32 = 7
+fc_features = 512 * 7 * 7 # c * w * h
 fc_hidden_units = 4096 # 任意
 ```

@@ -51,6 +52,7 @@ def vgg(conv_arch, fc_features, fc_hidden_units=4096):
    net = nn.Sequential()
    # 卷积层部分
    for i, (num_convs, in_channels, out_channels) in enumerate(conv_arch):
+        # 每经过一个vgg_block都会使宽高减半
        net.add_module("vgg_block_" + str(i+1), vgg_block(num_convs, in_channels, out_channels))
    # 全连接层部分
    net.add_module("fc", nn.Sequential(d2l.FlattenLayer(),

--- a/docs/chapter09_computer-vision/9.1_image-augmentation.md
+++ b/docs/chapter09_computer-vision/9.1_image-augmentation.md
+# 9.1 图像增广
+
+在5.6节（深度卷积神经网络）里我们提到过，大规模数据集是成功应用深度神经网络的前提。图像增广（image augmentation）技术通过对训练图像做一系列随机改变，来产生相似但又不同的训练样本，从而扩大训练数据集的规模。图像增广的另一种解释是，随机改变训练样本可以降低模型对某些属性的依赖，从而提高模型的泛化能力。例如，我们可以对图像进行不同方式的裁剪，使感兴趣的物体出现在不同位置，从而减轻模型对物体出现位置的依赖性。我们也可以调整亮度、色彩等因素来降低模型对色彩的敏感度。可以说，在当年AlexNet的成功中，图像增广技术功不可没。本节我们将讨论这个在计算机视觉里被广泛使用的技术。
+
+首先，导入实验所需的包或模块。
+
+``` python
+%matplotlib inline
+import time
+import torch
+from torch import nn, optim
+from torch.utils.data import Dataset, DataLoader
+import torchvision
+from PIL import Image
+
+import sys
+sys.path.append("..") 
+import d2lzh_pytorch as d2l
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+```
+
+## 9.1.1 常用的图像增广方法
+
+我们来读取一张形状为$400\times 500$（高和宽分别为400像素和500像素）的图像作为实验的样例。
+
+``` python
+d2l.set_figsize()
+img = Image.open('../../img/cat1.jpg')
+d2l.plt.imshow(img)
+```
+
+下面定义绘图函数`show_images`。
+
+``` python
+# 本函数已保存在d2lzh_pytorch包中方便以后使用
+def show_images(imgs, num_rows, num_cols, scale=2):
+    figsize = (num_cols * scale, num_rows * scale)
+    _, axes = d2l.plt.subplots(num_rows, num_cols, figsize=figsize)
+    for i in range(num_rows):
+        for j in range(num_cols):
+            axes[i][j].imshow(imgs[i * num_cols + j])
+            axes[i][j].axes.get_xaxis().set_visible(False)
+            axes[i][j].axes.get_yaxis().set_visible(False)
+    return axes
+```
+
+大部分图像增广方法都有一定的随机性。为了方便观察图像增广的效果，接下来我们定义一个辅助函数`apply`。这个函数对输入图像`img`多次运行图像增广方法`aug`并展示所有的结果。
+
+``` python
+def apply(img, aug, num_rows=2, num_cols=4, scale=1.5):
+    Y = [aug(img) for _ in range(num_rows * num_cols)]
+    show_images(Y, num_rows, num_cols, scale)
+```
+
+### 9.1.1.1 翻转和裁剪
+
+左右翻转图像通常不改变物体的类别。它是最早也是最广泛使用的一种图像增广方法。下面我们通过`torchvision.transforms`模块创建`RandomHorizontalFlip`实例来实现一半概率的图像水平（左右）翻转。
+
+``` python
+apply(img, torchvision.transforms.RandomHorizontalFlip())
+```
+
+上下翻转不如左右翻转通用。但是至少对于样例图像，上下翻转不会造成识别障碍。下面我们创建`RandomFlipTopBottom`实例来实现一半概率的图像垂直（上下）翻转。
+
+``` python
+apply(img, torchvision.transforms.RandomVerticalFlip())
+```
+
+在我们使用的样例图像里，猫在图像正中间，但一般情况下可能不是这样。在5.4节（池化层）里我们解释了池化层能降低卷积层对目标位置的敏感度。除此之外，我们还可以通过对图像随机裁剪来让物体以不同的比例出现在图像的不同位置，这同样能够降低模型对目标位置的敏感性。
+
+在下面的代码里，我们每次随机裁剪出一块面积为原面积$10\% \sim 100\%$的区域，且该区域的宽和高之比随机取自$0.5 \sim 2$，然后再将该区域的宽和高分别缩放到200像素。若无特殊说明，本节中$a$和$b$之间的随机数指的是从区间$[a,b]$中随机均匀采样所得到的连续值。
+
+``` python
+shape_aug = torchvision.transforms.RandomResizedCrop(200, scale=(0.1, 1), ratio=(0.5, 2))
+apply(img, shape_aug)
+```
+
+### 9.1.1.2 变化颜色
+
+另一类增广方法是变化颜色。我们可以从4个方面改变图像的颜色：亮度、对比度、饱和度和色调。在下面的例子里，我们将图像的亮度随机变化为原图亮度的$50\%$（$1-0.5$）$\sim 150\%$（$1+0.5$）。
+
+``` python
+apply(img, torchvision.transforms.ColorJitter(brightness=0.5))
+```
+
+类似地，我们也可以随机变化图像的色调。
+
+``` python
+apply(img, torchvision.transforms.ColorJitter(hue=0.5))
+```
+
+我们也可以同时设置如何随机变化图像的亮度（`brightness`）、对比度（`contrast`）、饱和度（`saturation`）和色调（`hue`）。
+
+``` python
+color_aug = torchvision.transforms.ColorJitter(
+    brightness=0.5, contrast=0.5, saturation=0.5, hue=0.5)
+apply(img, color_aug)
+```
+
+### 9.1.1.3 叠加多个图像增广方法
+
+实际应用中我们会将多个图像增广方法叠加使用。我们可以通过`Compose`实例将上面定义的多个图像增广方法叠加起来，再应用到每张图像之上。
+
+```{.python .input  n=31}
+augs = gdata.vision.transforms.Compose([
+    gdata.vision.transforms.RandomFlipLeftRight(), color_aug, shape_aug])
+apply(img, augs)
+```
+
+## 9.1.2 使用图像增广训练模型
+
+下面我们来看一个将图像增广应用在实际训练中的例子。这里我们使用CIFAR-10数据集，而不是之前我们一直使用的Fashion-MNIST数据集。这是因为Fashion-MNIST数据集中物体的位置和尺寸都已经经过归一化处理，而CIFAR-10数据集中物体的颜色和大小区别更加显著。下面展示了CIFAR-10数据集中前32张训练图像。
+
+```{.python .input  n=32}
+show_images(gdata.vision.CIFAR10(train=True)[0:32][0], 4, 8, scale=0.8);
+```
+
+为了在预测时得到确定的结果，我们通常只将图像增广应用在训练样本上，而不在预测时使用含随机操作的图像增广。在这里我们只使用最简单的随机左右翻转。此外，我们使用`ToTensor`实例将小批量图像转成MXNet需要的格式，即形状为(批量大小, 通道数, 高, 宽)、值域在0到1之间且类型为32位浮点数。
+
+```{.python .input  n=33}
+flip_aug = gdata.vision.transforms.Compose([
+    gdata.vision.transforms.RandomFlipLeftRight(),
+    gdata.vision.transforms.ToTensor()])
+
+no_aug = gdata.vision.transforms.Compose([
+    gdata.vision.transforms.ToTensor()])
+```
+
+接下来我们定义一个辅助函数来方便读取图像并应用图像增广。Gluon的数据集提供的`transform_first`函数将图像增广应用在每个训练样本（图像和标签）的第一个元素，即图像之上。有关`DataLoader`的详细介绍，可参考更早的[“图像分类数据集（Fashion-MNIST）”](../chapter_deep-learning-basics/fashion-mnist.md)一节。
+
+```{.python .input  n=34}
+num_workers = 0 if sys.platform.startswith('win32') else 4
+def load_cifar10(is_train, augs, batch_size):
+    return gdata.DataLoader(
+        gdata.vision.CIFAR10(train=is_train).transform_first(augs),
+        batch_size=batch_size, shuffle=is_train, num_workers=num_workers)
+```
+
+### 9.1.2.1 使用多GPU训练模型
+
+我们在CIFAR-10数据集上训练[“残差网络（ResNet）”](../chapter_convolutional-neural-networks/resnet.md)一节介绍的ResNet-18模型。我们还将应用[“多GPU计算的简洁实现”](../chapter_computational-performance/multiple-gpus-gluon.md)一节中介绍的方法，使用多GPU训练模型。
+
+首先，我们定义`try_all_gpus`函数，从而能够获取所有可用的GPU。
+
+```{.python .input  n=35}
+def try_all_gpus():  # 本函数已保存在d2lzh包中方便以后使用
+    ctxes = []
+    try:
+        for i in range(16):  # 假设一台机器上GPU的数量不超过16
+            ctx = mx.gpu(i)
+            _ = nd.array([0], ctx=ctx)
+            ctxes.append(ctx)
+    except mx.base.MXNetError:
+        pass
+    if not ctxes:
+        ctxes = [mx.cpu()]
+    return ctxes
+```
+
+下面定义的辅助函数`_get_batch`将小批量数据样本`batch`划分并复制到`ctx`变量所指定的各个显存上。
+
+```{.python .input}
+def _get_batch(batch, ctx):
+    features, labels = batch
+    if labels.dtype != features.dtype:
+        labels = labels.astype(features.dtype)
+    return (gutils.split_and_load(features, ctx),
+            gutils.split_and_load(labels, ctx), features.shape[0])
+```
+
+然后，我们定义`evaluate_accuracy`函数评价模型的分类准确率。与[“softmax回归的从零开始实现”](../chapter_deep-learning-basics/softmax-regression-scratch.md)和[“卷积神经网络（LeNet）”](../chapter_convolutional-neural-networks/lenet.md)两节中描述的`evaluate_accuracy`函数不同，这里定义的函数更加通用：它通过辅助函数`_get_batch`使用`ctx`变量所包含的所有GPU来评价模型。
+
+```{.python .input  n=36}
+# 本函数已保存在d2lzh包中方便以后使用
+def evaluate_accuracy(data_iter, net, ctx=[mx.cpu()]):
+    if isinstance(ctx, mx.Context):
+        ctx = [ctx]
+    acc_sum, n = nd.array([0]), 0
+    for batch in data_iter:
+        features, labels, _ = _get_batch(batch, ctx)
+        for X, y in zip(features, labels):
+            y = y.astype('float32')
+            acc_sum += (net(X).argmax(axis=1) == y).sum().copyto(mx.cpu())
+            n += y.size
+        acc_sum.wait_to_read()
+    return acc_sum.asscalar() / n
+```
+
+接下来，我们定义`train`函数使用多GPU训练并评价模型。
+
+```{.python .input  n=37}
+# 本函数已保存在d2lzh包中方便以后使用
+def train(train_iter, test_iter, net, loss, trainer, ctx, num_epochs):
+    print('training on', ctx)
+    if isinstance(ctx, mx.Context):
+        ctx = [ctx]
+    for epoch in range(num_epochs):
+        train_l_sum, train_acc_sum, n, m, start = 0.0, 0.0, 0, 0, time.time()
+        for i, batch in enumerate(train_iter):
+            Xs, ys, batch_size = _get_batch(batch, ctx)
+            ls = []
+            with autograd.record():
+                y_hats = [net(X) for X in Xs]
+                ls = [loss(y_hat, y) for y_hat, y in zip(y_hats, ys)]
+            for l in ls:
+                l.backward()
+            trainer.step(batch_size)
+            train_l_sum += sum([l.sum().asscalar() for l in ls])
+            n += sum([l.size for l in ls])
+            train_acc_sum += sum([(y_hat.argmax(axis=1) == y).sum().asscalar()
+                                 for y_hat, y in zip(y_hats, ys)])
+            m += sum([y.size for y in ys])
+        test_acc = evaluate_accuracy(test_iter, net, ctx)
+        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, '
+              'time %.1f sec'
+              % (epoch + 1, train_l_sum / n, train_acc_sum / m, test_acc,
+                 time.time() - start))
+```
+
+现在就可以定义`train_with_data_aug`函数使用图像增广来训练模型了。该函数获取了所有可用的GPU，并将Adam算法作为训练使用的优化算法，然后将图像增广应用于训练数据集之上，最后调用刚才定义的`train`函数训练并评价模型。
+
+```{.python .input  n=38}
+def train_with_data_aug(train_augs, test_augs, lr=0.001):
+    batch_size, ctx, net = 256, try_all_gpus(), d2l.resnet18(10)
+    net.initialize(ctx=ctx, init=init.Xavier())
+    trainer = gluon.Trainer(net.collect_params(), 'adam',
+                            {'learning_rate': lr})
+    loss = gloss.SoftmaxCrossEntropyLoss()
+    train_iter = load_cifar10(True, train_augs, batch_size)
+    test_iter = load_cifar10(False, test_augs, batch_size)
+    train(train_iter, test_iter, net, loss, trainer, ctx, num_epochs=10)
+```
+
+下面使用随机左右翻转的图像增广来训练模型。
+
+```{.python .input  n=39}
+train_with_data_aug(flip_aug, no_aug)
+```
+
+## 小结
+
+* 图像增广基于现有训练数据生成随机图像从而应对过拟合。
+* 为了在预测时得到确定的结果，通常只将图像增广应用在训练样本上，而不在预测时使用含随机操作的图像增广。
+* 可以从Gluon的`transforms`模块中获取有关图片增广的类。
+