diff --git a/build/build.yml b/build/build.yml
index b8d256884d632f642995b6a2413f25fd122d534c..e7772de1ac708e3f4b75d8b160d8538b753fde93 100644
--- a/build/build.yml
+++ b/build/build.yml
@@ -13,5 +13,5 @@ dependencies:
   - recommonmark==0.4.0
   - https://github.com/mli/notedown/tarball/master
   - mxnet-cu80==1.2.1
-  - gluonbook==0.7.1
+  - gluonbook==0.7.2
   - jieba==0.39
diff --git a/chapter_computational-performance/multiple-gpus-gluon.md b/chapter_computational-performance/multiple-gpus-gluon.md
index c846f9529d9cb593570c03f3401828064e6b1db1..73c43381658a6bbb0123d9c595967e6c85942403 100644
--- a/chapter_computational-performance/multiple-gpus-gluon.md
+++ b/chapter_computational-performance/multiple-gpus-gluon.md
@@ -4,7 +4,7 @@
 
 先导入本节实验需要的包或模块。同上一节，运行本节中的程序需要至少两块GPU。
 
-```{.python .input}
+```{.python .input  n=1}
 import gluonbook as gb
 import mxnet as mx
 from mxnet import autograd, gluon, init, nd
@@ -16,7 +16,7 @@ from time import time
 
 我们使用ResNet-18来作为本节的样例模型。我们将`resnet18`函数定义在`gluonbook`包中供后面章节调用。
 
-```{.python .input  n=1}
+```{.python .input  n=2}
 def resnet18(num_classes):
     net = nn.Sequential()
     net.add(nn.Conv2D(64, kernel_size=3, strides=1, padding=1),
@@ -45,14 +45,14 @@ net = resnet18(10)
 
 之前我们介绍了如何使用`initialize`函数的`ctx`参数在CPU或单个GPU上初始化模型参数。事实上，`ctx`可以接受一系列的CPU/GPU，从而使初始化好的模型参数复制到`ctx`里所有的CPU/GPU上。
 
-```{.python .input}
+```{.python .input  n=3}
 ctx = [mx.gpu(0), mx.gpu(1)]
 net.initialize(init=init.Normal(sigma=0.01), ctx=ctx)
 ```
 
 Gluon提供了上一节中实现的`split_and_load`函数。它可以划分一个小批量的数据样本并复制到各个CPU/GPU上。之后，根据输入数据所在的CPU/GPU，模型计算会发生在相同的CPU/GPU上。
 
-```{.python .input}
+```{.python .input  n=4}
 x = nd.random.uniform(shape=(4, 1, 28, 28))
 gpu_x = gutils.split_and_load(x, ctx)
 net(gpu_x[0]), net(gpu_x[1])
@@ -60,7 +60,7 @@ net(gpu_x[0]), net(gpu_x[1])
 
 回忆一下[“模型参数的延后初始化”](../chapter_deep-learning-computation/deferred-init.md)一节中介绍的延后的初始化。现在，我们可以通过`data`访问初始化好的模型参数值了。需要注意的是，默认下`weight.data()`会返回CPU上的参数值。由于我们指定了2个GPU来初始化模型参数，我们需要指定GPU访问。我们看到，相同参数在不同的GPU上的值一样。
 
-```{.python .input}
+```{.python .input  n=5}
 weight = net[0].params.get('weight')
 try:
     weight.data()
@@ -73,7 +73,7 @@ weight.data(ctx[0])[0], weight.data(ctx[1])[0]
 
 我们先定义交叉熵损失函数。
 
-```{.python .input}
+```{.python .input  n=6}
 loss = gloss.SoftmaxCrossEntropyLoss()
 ```
 
@@ -97,7 +97,8 @@ def train(num_gpus, batch_size, lr):
                     gpu_Xs, gpu_ys)]
             for l in ls:
                 l.backward()
-            trainer.step(batch_size)
+            # 每个 GPU 上批量大小为总批量大小与 GPU 数量之比。
+            trainer.step(batch_size / len(ctx))
         nd.waitall()
         print('epoch %d, training time: %.1f sec' % (epoch, time() - start))
         test_acc = gb.evaluate_accuracy(test_iter, net, ctx[0])
@@ -106,7 +107,7 @@ def train(num_gpus, batch_size, lr):
 
 我们在2个GPU上训练模型。
 
-```{.python .input}
+```{.python .input  n=8}
 train(num_gpus=2, batch_size=512, lr=0.3)
 ```
 
diff --git a/chapter_computational-performance/multiple-gpus.md b/chapter_computational-performance/multiple-gpus.md
index 78bbe3deb08132ed61d847c18a88d4ac831ecaa4..a3fac99f7e559b06f00124fa12c894b2ac75bfd4 100644
--- a/chapter_computational-performance/multiple-gpus.md
+++ b/chapter_computational-performance/multiple-gpus.md
@@ -128,7 +128,7 @@ print('output:', splitted)
 
 ## 单个小批量上的多GPU训练
 
-现在我们可以实现单个小批量上的多GPU训练了。它的实现主要依据本节介绍的数据并行方法。我们将使用刚刚定义的多GPU之间同步数据的辅助函数，例如`split_and_load`和`allreduce`。
+现在我们可以实现单个小批量上的多GPU训练了。它的实现主要依据本节介绍的数据并行方法。我们将使用刚刚定义的多GPU之间同步数据的辅助函数，例如`split_and_load`和`allreduce`。需要注意的是，一个小批量的样本会被均分到各个GPU上，因此每个GPU上的批量大小等于总批量大小除以GPU数量。
 
 ```{.python .input  n=6}
 def train_batch(X, y, gpu_params, ctx, lr):
@@ -145,9 +145,10 @@ def train_batch(X, y, gpu_params, ctx, lr):
     # 把各个 GPU 上的梯度加起来，然后再广播到所有 GPU 上。
     for i in range(len(gpu_params[0])):
         allreduce([gpu_params[c][i].grad for c in range(len(ctx))])
-    # 在各个 GPU 上更新自己维护的那一份完整的模型参数。
+    # 在各个 GPU 上更新自己维护的那一份完整的模型参数。每个 GPU 上批量大小为总批量大小
+    # X.shape[0] 与 GPU 数量 len(ctx) 之比。
     for param in gpu_params:
-        gb.sgd(param, lr, X.shape[0])
+        gb.sgd(param, lr, X.shape[0] / len(ctx))
 ```
 
 ## 训练函数
@@ -177,13 +178,13 @@ def train(num_gpus, batch_size, lr):
 我们先使用一个GPU来训练。
 
 ```{.python .input  n=8}
-train(num_gpus=1, batch_size=256, lr=0.3)
+train(num_gpus=1, batch_size=256, lr=0.2)
 ```
 
 接下来，我们先使用2个GPU来训练。我们将批量大小也增加一倍，以使得GPU的计算资源能够得到较充分利用。
 
 ```{.python .input  n=10}
-train(num_gpus=2, batch_size=512, lr=0.3)
+train(num_gpus=2, batch_size=512, lr=0.2)
 ```
 
 由于批量大小增加了一倍，每个迭代周期的迭代次数减小了一半。因此，我们观察到每个迭代周期的耗时比单GPU训练时少了近一半。但由于总体迭代次数的减少，模型在验证数据集上的精度略有下降。这很可能是由于训练不够充分造成的。因此，多GPU训练时，我们可以适当增加迭代周期使训练较充分。
diff --git a/chapter_computer-vision/fcn.md b/chapter_computer-vision/fcn.md
index 29a0a72d252c3830df90cdcaa3ce8ea18abf7e7c..a7c661d2ca2016df8ef8f8dc0aabc58563e4121d 100644
--- a/chapter_computer-vision/fcn.md
+++ b/chapter_computer-vision/fcn.md
@@ -4,7 +4,7 @@
 
 但在语义分割里，我们需要对每个像素预测类别，也就是需要输出形状需要是$1000\times 224\times 224$。如果仍然使用全连接层作为输出，那么这一层权重将多达数百GB。本小节我们将介绍利用卷积神经网络解决语义分割的一个开创性工作之一：全卷积网络（fully convolutional network，简称FCN）[1]。FCN里将最后的全连接层修改称转置卷积层（transposed convolution）来得到所需大小的输出。
 
-```{.python .input  n=1}
+```{.python .input  n=2}
 %matplotlib inline
 import gluonbook as gb
 from mxnet import gluon, init, nd, image
@@ -19,7 +19,7 @@ import sys
 
 下面我们构造一个卷积层并打印它的输出形状。
 
-```{.python .input  n=2}
+```{.python .input  n=3}
 conv = nn.Conv2D(10, kernel_size=4, padding=1, strides=2)
 conv.initialize()
 
@@ -30,7 +30,7 @@ y.shape
 
 使用用样的卷积窗、填充和步幅的转置卷积层，我们可以得到和`x`一样的输出。
 
-```{.python .input  n=3}
+```{.python .input  n=4}
 conv_trans = nn.Conv2DTranspose(3, kernel_size=4, padding=1, strides=2)
 conv_trans.initialize()
 conv_trans(y).shape
@@ -46,14 +46,14 @@ FCN的核心思想是将一个卷积网络的最后全连接输出层替换成
 
 下面我们基于ResNet 18来创建FCN。首先我们下载一个预先训练好的模型，并打印其最后的数个神经层。
 
-```{.python .input  n=4}
+```{.python .input  n=5}
 pretrained_net = model_zoo.vision.resnet18_v2(pretrained=True)
 pretrained_net.features[-4:], pretrained_net.output
 ```
 
 可以看到`feature`模块最后两层是`GlobalAvgPool2D`和`Flatten`，在FCN里均不需要，`output`模块里的全连接层也需要舍去。下面我们定义一个新的网络，它复制除了`feature`里除去最后两层的所有神经层以及权重。
 
-```{.python .input  n=5}
+```{.python .input  n=6}
 net = nn.HybridSequential()
 for layer in pretrained_net.features[:-2]:
     net.add(layer)
@@ -61,14 +61,14 @@ for layer in pretrained_net.features[:-2]:
 
 给定高宽为224的输入，`net`的输出将输入高宽减少了32倍。
 
-```{.python .input  n=6}
+```{.python .input  n=7}
 x = nd.random.uniform(shape=(1, 3, 224, 224))
 net(x).shape
 ```
 
 为了是的输出跟输入有同样的高宽，我们构建一个步幅为32的转置卷积层，卷积核的窗口高宽设置成步幅的2倍，并补充适当的填充。在转置卷积层之前，我们加上$1\times 1$卷积层来将通道数从512降到标注类别数，对Pascal VOC数据集来说是21。
 
-```{.python .input  n=7}
+```{.python .input  n=8}
 num_classes = 21
 
 net.add(
@@ -81,7 +81,7 @@ net.add(
 
 模型`net`中的最后两层需要对权重进行初始化，通常我们会使用随机初始化。但新加入的转置卷积层的功能有些类似于将输入调整到更大的尺寸。在图片处理里面，我们可以通过有适当卷积核的卷积运算符来完成这个操作。常用的包括双线性差值核，下面函数构造核权重。
 
-```{.python .input  n=8}
+```{.python .input  n=9}
 def bilinear_kernel(in_channels, out_channels, kernel_size):
     factor = (kernel_size + 1) // 2
     if kernel_size % 2 == 1:
@@ -100,14 +100,14 @@ def bilinear_kernel(in_channels, out_channels, kernel_size):
 
 接下来我们构造一个步幅为2的转置卷积层，将其权重初始化成双线性差值核。
 
-```{.python .input  n=9}
+```{.python .input  n=10}
 conv_trans = nn.Conv2DTranspose(3, kernel_size=4, padding=1, strides=2)
 conv_trans.initialize(init.Constant(bilinear_kernel(3, 3, 4)))
 ```
 
 可以看到这个转置卷积层的前向函数的效果是将输入图片高宽扩大2倍。
 
-```{.python .input  n=10}
+```{.python .input  n=11}
 gb.set_figsize()
 img = image.imread('../img/catdog.jpg')
 print('input', img.shape)
@@ -120,7 +120,7 @@ gb.plt.imshow(y.asnumpy());
 
 下面对`net`的最后两层进行初始化。其中$1\times 1$卷积层使用Xavier，转置卷积层则使用双线性差值核。
 
-```{.python .input  n=11}
+```{.python .input  n=12}
 trans_conv_weights = bilinear_kernel(num_classes, num_classes, 64)
 net[-1].initialize(init.Constant(trans_conv_weights))
 net[-2].initialize(init=init.Xavier())
@@ -130,7 +130,7 @@ net[-2].initialize(init=init.Xavier())
 
 我们使用较大的输入图片尺寸，其值选成了32的倍数。数据的读取方法已在上一节描述。
 
-```{.python .input}
+```{.python .input  n=13}
 input_shape = (320, 480)
 batch_size = 32
 colormap2label = nd.zeros(256**3)
@@ -156,7 +156,7 @@ ctx = gb.try_all_gpus()
 loss = gloss.SoftmaxCrossEntropyLoss(axis=1)
 net.collect_params().reset_ctx(ctx)
 trainer = gluon.Trainer(net.collect_params(), 'sgd',
-                        {'learning_rate': 0.1, 'wd': 1e-3})
+                        {'learning_rate': 0.02, 'wd': 1e-3})
 gb.train(train_iter, test_iter, net, loss, trainer, ctx, num_epochs=5)
 ```
 
@@ -167,7 +167,7 @@ gb.train(train_iter, test_iter, net, loss, trainer, ctx, num_epochs=5)
 ```{.python .input  n=13}
 def predict(im):
     data = test_iter._dataset.normalize_image(im)
-    data = data.transpose((2,0,1)).expand_dims(axis=0)
+    data = data.transpose((2, 0, 1)).expand_dims(axis=0)
     yhat = net(data.as_in_context(ctx[0]))
     pred = nd.argmax(yhat, axis=1)
     return pred.reshape((pred.shape[1], pred.shape[2]))
diff --git a/chapter_computer-vision/image-augmentation.md b/chapter_computer-vision/image-augmentation.md
index 5ad0cbb8405d5669a2a4f38e95c5abb6c8bc2458..22fc0b3c1d72d614c072c9df72413052a3e369bc 100644
--- a/chapter_computer-vision/image-augmentation.md
+++ b/chapter_computer-vision/image-augmentation.md
@@ -216,7 +216,7 @@ def train(train_iter, test_iter, net, loss, trainer, ctx, num_epochs):
 现在，我们可以定义函数使用图片增广来训练模型了。
 
 ```{.python .input  n=38}
-def train_with_data_aug(train_augs, test_augs, lr=0.01):
+def train_with_data_aug(train_augs, test_augs, lr=0.005):
     batch_size = 256
     ctx = try_all_gpus()
     net = gb.resnet18(10)
@@ -235,7 +235,7 @@ def train_with_data_aug(train_augs, test_augs, lr=0.01):
 train_with_data_aug(train_augs, test_augs)
 ```
 
-作为对比，我们尝试只对训练数据做中间剪裁。
+作为对比，下面我们尝试不使用图片增广。
 
 ```{.python .input  n=40}
 train_with_data_aug(test_augs, test_augs)
diff --git a/chapter_natural-language-processing/sentiment-analysis-cnn.md b/chapter_natural-language-processing/sentiment-analysis-cnn.md
index 43655f4c3faac64e4c8832e7308f9a7b08f084b6..d013fd920824129ae6a915e68d7f292e86a85294 100644
--- a/chapter_natural-language-processing/sentiment-analysis-cnn.md
+++ b/chapter_natural-language-processing/sentiment-analysis-cnn.md
@@ -283,7 +283,7 @@ class TextCNN(nn.Block):
 ```{.python .input  n=11}
 num_outputs = 2
 lr = 0.001
-num_epochs = 1
+num_epochs = 5
 batch_size = 64
 embed_size = 100
 ngram_kernel_sizes = [3, 4, 5]
@@ -318,6 +318,20 @@ test_loader = gdata.DataLoader(test_set, batch_size=batch_size, shuffle=False)
 gb.train(train_loader, test_loader, net, loss, trainer, ctx, num_epochs)
 ```
 
+```{.python .input}
+review = ['this', 'movie', 'is', 'just', 'great']
+nd.argmax(net(nd.reshape(
+    nd.array([vocab.token_to_idx[token] for token in review], ctx=gb.try_gpu()), 
+    shape=(1, -1))), axis=1).asscalar()
+```
+
+```{.python .input}
+review = ['this', 'movie', 'is', 'terribly', 'boring']
+nd.argmax(net(nd.reshape(
+    nd.array([vocab.token_to_idx[token] for token in review], ctx=gb.try_gpu()), 
+    shape=(1, -1))), axis=1).asscalar()
+```
+
 ## 小结
 
 * 我们可以使用一维卷积来处理时序序列任务，如自然语言处理。
diff --git a/chapter_natural-language-processing/sentiment-analysis.md b/chapter_natural-language-processing/sentiment-analysis.md
index 216f320e8c766c2bb35c4d2870caff6685e95efc..1dbcb0e266ded37d21522c112f06b439fa4a27a9 100644
--- a/chapter_natural-language-processing/sentiment-analysis.md
+++ b/chapter_natural-language-processing/sentiment-analysis.md
@@ -181,8 +181,8 @@ class SentimentNet(nn.Block):
 
 ```{.python .input  n=11}
 num_outputs = 2
-lr = 0.1
-num_epochs = 1
+lr = 0.5
+num_epochs = 5
 batch_size = 64
 embed_size = 100
 num_hiddens = 100
@@ -220,7 +220,14 @@ gb.train(train_loader, test_loader, net, loss, trainer, ctx, num_epochs)
 下面我们试着分析一个简单的句子的情感（1和0分别代表正面和负面）。为了在更复杂的句子上得到较准确的分类，我们需要使用完整数据集训练模型，并适当增大训练周期。
 
 ```{.python .input  n=18}
-review = ['this', 'movie', 'is', 'great']
+review = ['this', 'movie', 'is', 'just', 'great']
+nd.argmax(net(nd.reshape(
+    nd.array([vocab.token_to_idx[token] for token in review], ctx=gb.try_gpu()), 
+    shape=(1, -1))), axis=1).asscalar()
+```
+
+```{.python .input}
+review = ['this', 'movie', 'is', 'terribly', 'boring']
 nd.argmax(net(nd.reshape(
     nd.array([vocab.token_to_idx[token] for token in review], ctx=gb.try_gpu()), 
     shape=(1, -1))), axis=1).asscalar()
diff --git a/environment.yml b/environment.yml
index 2a82a30b68239b41f16fb9989d170b10a335e6b2..afa225fe7171834f08c17ea190e82292d9039564 100644
--- a/environment.yml
+++ b/environment.yml
@@ -7,4 +7,4 @@ dependencies:
 - pip:
   - requests==2.18.4
   - mxnet==1.2.1
-  - gluonbook==0.7.1
+  - gluonbook==0.7.2
diff --git a/gluonbook/__init__.py b/gluonbook/__init__.py
index 3a0031bcb91bb96728da70b3a435a615e39a676b..e4ab92712fec7a639c25128298d6d157c05c52de 100644
--- a/gluonbook/__init__.py
+++ b/gluonbook/__init__.py
@@ -1,4 +1,4 @@
 
 from .utils import *
 
-__version__ = '0.7.1'
+__version__ = '0.7.2'
diff --git a/gluonbook/utils.py b/gluonbook/utils.py
index dd85f14bc9b9ab0de4c96711829f1f0c125c5b6f..4bb3a6ef4cff2c109c075cad1818163f50bff6c9 100644
--- a/gluonbook/utils.py
+++ b/gluonbook/utils.py
@@ -387,7 +387,7 @@ def train(train_iter, test_iter, net, loss, trainer, ctx, num_epochs):
             train_acc_sum += sum([(y_hat.argmax(axis=1) == y).sum().asscalar()
                                  for y_hat, y in zip(y_hats, ys)])
             train_l_sum += sum([l.sum().asscalar() for l in ls])
-            trainer.step(batch_size)
+            trainer.step(batch_size / len(ctx))
             n += batch_size
             m += sum([y.size for y in ys])
         test_acc = evaluate_accuracy(test_iter, net, ctx)