use adam in neural style transfer

cf230d49 · Aston Zhang · 5e69d3bb · cf230d49 · 5e69d3bb · cf230d49
Showing with 39 addition and 28 deletion

chapter_computer-vision/neural-style.md chapter_computer-vision/neural-style.md +39 -28

img/neural-style-1.png img/neural-style-1.png +0 -0

img/neural-style-2.png img/neural-style-2.png +0 -0

未找到文件。
--- a/chapter_computer-vision/neural-style.md
+++ b/chapter_computer-vision/neural-style.md
@@ -11,13 +11,13 @@

 ![使用神经网络进行样式迁移。](../img/neural-style.svg)

-```{.python .input}
+```{.python .input  n=1}
 import sys
 sys.path.insert(0, '..')

 %matplotlib inline
 import gluonbook as gb
-from mxnet import autograd, gluon, image, nd
+from mxnet import autograd, gluon, image, init, nd
 from mxnet.gluon import model_zoo, nn
 import time
 ```
@@ -26,20 +26,20 @@ import time

 我们分别读取样式和内容图像。

-```{.python .input  n=9}
+```{.python .input  n=2}
 gb.set_figsize()
 style_img = image.imread('../img/autumn_oak.jpg')
 gb.plt.imshow(style_img.asnumpy());
 ```

-```{.python .input  n=8}
+```{.python .input  n=3}
 content_img = image.imread('../img/rainier.jpg')
 gb.plt.imshow(content_img.asnumpy());
 ```

 然后定义预处理和后处理函数。预处理函数将原始图像进行归一化并转换成卷积网络接受的输入格式，后处理函数则还原成能展示的图像格式。

-```{.python .input  n=2}
+```{.python .input  n=4}
 rgb_mean = nd.array([0.485, 0.456, 0.406])
 rgb_std = nd.array([0.229, 0.224, 0.225])

@@ -57,13 +57,13 @@ def postprocess(img):

 我们使用原论文使用的VGG 19模型，并下载在Imagenet上训练好的权重 [1]。

-```{.python .input  n=3}
+```{.python .input  n=5}
 pretrained_net = model_zoo.vision.vgg19(pretrained=True)
 ```

 我们知道VGG使用了五个卷积块来构建网络，块之间使用最大池化层来做间隔（参考[“使用重复元素的网络（VGG）”](../chapter_convolutional-neural-networks/vgg.md)小节）。原论文中使用每个卷积块的第一个卷积层输出来匹配样式（称之为样式层），和第四块中的最后一个卷积层来匹配内容（称之为内容层）[1]。我们可以打印`pretrained_net`来获取这些层的具体位置。

-```{.python .input  n=11}
+```{.python .input  n=6}
 style_layers, content_layers = [0, 5, 10, 19, 28], [25]
 ```

@@ -71,7 +71,7 @@ style_layers, content_layers = [0, 5, 10, 19, 28], [25]

 下面构建一个新的网络使其只保留我们需要预留的层。

-```{.python .input  n=13}
+```{.python .input  n=7}
 net = nn.Sequential()
 for i in range(max(content_layers + style_layers) + 1):
    net.add(pretrained_net.features[i])
@@ -79,7 +79,7 @@ for i in range(max(content_layers + style_layers) + 1):

 给定输入`x`，简单使用`net(x)`只能拿到最后的输出，而这里我们还需要中间层输出。因此我们逐层计算，并保留样式层和内容层的输出。

-```{.python .input  n=14}
+```{.python .input  n=8}
 def extract_features(x, content_layers, style_layers):
    contents = []
    styles = []
@@ -94,7 +94,7 @@ def extract_features(x, content_layers, style_layers):

 最后我们定义函数分别对内容图像和样式图像抽取对应的特征。因为在训练时我们不修改网络的权重，所以我们可以在训练开始之前提取出所要的特征。

-```{.python .input}
+```{.python .input  n=9}
 def get_contents(image_shape, ctx):
    content_x = preprocess(content_img, image_shape).copyto(ctx)
    content_y, _ = extract_features(content_x, content_layers, style_layers)
@@ -110,7 +110,7 @@ def get_styles(image_shape, ctx):

 在训练时，我们需要定义如何比较合成图像和内容图像的内容层输出（内容损失函数），以及比较和样式图像的样式层输出（样式损失函数）。内容损失函数可以使用回归用的均方误差。

-```{.python .input}
+```{.python .input  n=10}
 def content_loss(y_hat, y):
     """Return the mean squared error between ``y_hat`` and ``y``.

     ``y_hat`` is the content-layer output of the synthesized image and ``y``
     the corresponding output of the content image. The element-wise
     difference is squared and averaged over every element.
     """
     return (y_hat - y).square().mean()
 ```
@@ -128,7 +128,7 @@ def gram(x):

 和对应的损失函数，这里假设样式图像的样式特征协方差已经预先计算好了。

-```{.python .input}
+```{.python .input  n=12}
 def style_loss(y_hat, gram_y):
     """Return the mean squared error between ``gram(y_hat)`` and ``gram_y``.

     ``gram_y`` is passed in already computed, so ``gram`` (defined elsewhere
     in this file) is applied to ``y_hat`` only.
     """
     return (gram(y_hat) - gram_y).square().mean()
 ```
@@ -137,7 +137,7 @@ def style_loss(y_hat, gram_y):

 $$\sum_{i,j} \left|x_{i,j} - x_{i+1,j}\right| + \left|x_{i,j} - x_{i,j+1}\right|.$$

-```{.python .input}
+```{.python .input  n=13}
 def tv_loss(y_hat):
     """Return a total-variation style smoothness penalty for ``y_hat``.

     ``y_hat`` is indexed with four axes; presumably the layout is
     (batch, channel, height, width) -- TODO confirm against ``preprocess``.
     The result is the average of two terms: the mean absolute difference
     between neighbours along axis 2 and along axis 3. Note that this uses
     means and a 0.5 factor rather than the plain sums of the formula in the
     surrounding text.
     """
     # First term: neighbours along axis 2; second term: neighbours along axis 3.
     return 0.5 * ((y_hat[:, :, 1:, :] - y_hat[:, :, :-1, :]).abs().mean() +
                   (y_hat[:, :, :, 1:] - y_hat[:, :, :, :-1]).abs().mean())
@@ -145,9 +145,9 @@ def tv_loss(y_hat):

 训练中我们将上述三个损失函数加权求和。通过调整权重值我们可以控制学到的图像是否保留更多样式，更多内容，还是更加干净。此外注意到样式层里有五个神经层，我们对靠近输入的有较少的通道数的层给予比较大的权重。

-```{.python .input  n=12}
+```{.python .input  n=14}
 style_channels = [net[l].weight.shape[0] for l in style_layers]
-style_weights = [1e4] * len(style_channels)
+style_weights = [1e3] * len(style_channels)
 content_weights, tv_weight = [1], 10
 ```

@@ -155,10 +155,23 @@ content_weights, tv_weight = [1], 10

 这里的训练跟前面章节的主要不同在于我们只对输入`x`进行更新。为此我们将`x`包装成一个只含单个参数的模型，并使用Adam算法通过`Trainer`来更新它，而且每隔一定的迭代周期我们减小一次学习率。

-```{.python .input  n=18}
+```{.python .input  n=15}
+class TransferredImage(nn.Block):
+    def __init__(self, img_shape, **kwargs):
+        super(TransferredImage, self).__init__(**kwargs)
+        self.weight = self.params.get('weight', shape=img_shape)
+
+    def forward(self):
+        return self.weight.data()
+
+
 def train(x, content_y, style_y, ctx, lr, max_epochs, lr_decay_epoch):
-    x = x.as_in_context(ctx)
-    x.attach_grad()
+    net = TransferredImage(x.shape)
+    net.initialize(init.Constant(x), ctx=ctx, force_reinit=True)
+    trainer = gluon.Trainer(net.collect_params(), 'adam',
+                            {'learning_rate': lr})
+    x = net()
+
    style_y_gram = [gram(y) for y in style_y]
    for i in range(max_epochs):
        tic = time.time()
@@ -174,10 +187,8 @@ def train(x, content_y, style_y, ctx, lr, max_epochs, lr_decay_epoch):
            tv_L = tv_weight * tv_loss(x)
            # 对所有损失求和。
            l = nd.add_n(*style_L) + nd.add_n(*content_L) + tv_L
-        l.backward()
-        # 对 x 的梯度除去绝对均值使得数值更加稳定，并更新 x。
-        x.grad[:] /= x.grad.abs().mean() + 1e-8
-        x[:] -= lr * x.grad
+        l.backward()     
+        trainer.step(1)
        # 如果不加的话会导致每50轮迭代才同步一次，可能导致过大内存使用。
        nd.waitall()

@@ -188,14 +199,14 @@ def train(x, content_y, style_y, ctx, lr, max_epochs, lr_decay_epoch):
                     nd.add_n(*style_L).asscalar(), tv_L.asscalar(),
                     time.time() - tic))
        if i % lr_decay_epoch == 0:
-            lr *= 0.1
-            print('change lr to %.1e' % lr)
-    return x
+            trainer.set_learning_rate(trainer.learning_rate * 0.1)
+            print('change lr to %.1e' % trainer.learning_rate)
+    return net()
 ```

 现在我们可以真正开始训练了。首先我们将图像调整到高为300宽200来进行训练，这样使得训练更加快速。合成图像的初始值设成了内容图像，使得初始值能尽可能接近训练输出来加速收敛。

-```{.python .input  n=19}
+```{.python .input  n=16}
 ctx, image_shape = gb.try_gpu(), (300, 200)
 net.collect_params().reset_ctx(ctx)
 content_x, content_y = get_contents(image_shape, ctx)
@@ -207,7 +218,7 @@ y = train(x, content_y, style_y, ctx, 0.1, 500, 200)

 因为使用了内容图像作为初始值，所以一开始内容误差远小于样式误差。随着迭代的进行样式误差迅速减少，最终它们的值落在相近的范围。下面我们将训练好的合成图像保存下来。

-```{.python .input}
+```{.python .input  n=17}
 gb.plt.imsave('../img/neural-style-1.png', postprocess(y).asnumpy())
 ```

@@ -215,7 +226,7 @@ gb.plt.imsave('../img/neural-style-1.png', postprocess(y).asnumpy())

 可以看到图9.13中的合成图像保留了内容图像的风景物体，同时借鉴了样式图像的色彩。由于图像尺寸较小，所以细节上比较模糊。下面我们在更大的$1200 \times 800$的尺寸上训练，希望可以得到更加清晰的合成图像。为了加速收敛，我们将训练到的合成图像高宽放大3倍来作为初始值。

-```{.python .input  n=20}
+```{.python .input  n=18}
 image_shape = (1200, 800)

 content_x, content_y = get_contents(image_shape, ctx)

--- a/img/neural-style-1.png
+++ b/img/neural-style-1.png
--- a/img/neural-style-2.png
+++ b/img/neural-style-2.png