revise opt and utils

cab512c5 · Aston Zhang · 7617b133 · cab512c5 · cab512c5 · cab512c5
15 changed files
--- a/chapter_optimization/adadelta-gluon.md
+++ b/chapter_optimization/adadelta-gluon.md
@@ -8,9 +8,8 @@
 %config InlineBackend.figure_format = 'retina'
 %matplotlib inline
 import mxnet as mx
-from mxnet import autograd, gluon, nd
+from mxnet import gluon, nd
 from mxnet.gluon import nn
-import numpy as np
 import sys
 sys.path.append('..')
 import utils
@@ -24,9 +23,9 @@ num_inputs = 2
 num_examples = 1000
 true_w = [2, -3.4]
 true_b = 4.2
-X = nd.random.normal(scale=1, shape=(num_examples, num_inputs))
-y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
-y += nd.random.normal(scale=0.01, shape=y.shape)
+features = nd.random.normal(scale=1, shape=(num_examples, num_inputs))
+labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
+labels += nd.random.normal(scale=0.01, shape=labels.shape)

 # 线性回归模型。
 net = nn.Sequential()
@@ -39,7 +38,7 @@ net.add(nn.Dense(1))
 net.initialize(mx.init.Normal(sigma=1), force_reinit=True)
 trainer = gluon.Trainer(net.collect_params(), 'adadelta', {'rho': 0.9999})
 utils.optimize(batch_size=10, trainer=trainer, num_epochs=3, decay_epoch=None,
-               log_interval=10, X=X, y=y, net=net)
+               log_interval=10, features=features, labels=labels, net=net)
 ```

 ## 小结

--- a/chapter_optimization/adadelta-scratch.md
+++ b/chapter_optimization/adadelta-scratch.md
@@ -66,9 +66,9 @@ num_inputs = 2
 num_examples = 1000
 true_w = [2, -3.4]
 true_b = 4.2
-X = nd.random.normal(scale=1, shape=(num_examples, num_inputs))
-y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
-y += nd.random.normal(scale=0.01, shape=y.shape)
+features = nd.random.normal(scale=1, shape=(num_examples, num_inputs))
+labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
+labels += nd.random.normal(scale=0.01, shape=labels.shape)

 # 初始化模型参数。
 def init_params():
@@ -89,24 +89,23 @@ def init_params():

 ```{.python .input  n=2}
 net = utils.linreg
-squared_loss = utils.squared_loss
+loss = utils.squared_loss

 def optimize(batch_size, rho, num_epochs, log_interval):
    [w, b], sqrs, deltas = init_params()
-    y_vals = [squared_loss(net(X, w, b), y).mean().asnumpy()]
+    ls = [loss(net(features, w, b), labels).mean().asnumpy()]
    for epoch in range(1, num_epochs + 1):
-        for batch_i, (features, label) in enumerate(
-            utils.data_iter(batch_size, num_examples, X, y)):
+        for batch_i, (X, y) in enumerate(
+            utils.data_iter(batch_size, num_examples, features, labels)):
            with autograd.record():
-                output = net(features, w, b)
-                loss = squared_loss(output, label)
-            loss.backward()
+                l = loss(net(X, w, b), y)
+            l.backward()
            adadelta([w, b], sqrs, deltas, rho, batch_size)
            if batch_i * batch_size % log_interval == 0:
-                y_vals.append(squared_loss(net(X, w, b), y).mean().asnumpy())
+                ls.append(loss(net(features, w, b), labels).mean().asnumpy())
    print('w:', w, '\nb:', b, '\n')
-    x_vals = np.linspace(0, num_epochs, len(y_vals), endpoint=True)
-    utils.semilogy(x_vals, y_vals, 'epoch', 'loss')
+    es = np.linspace(0, num_epochs, len(ls), endpoint=True)
+    utils.semilogy(es, ls, 'epoch', 'loss')
 ```

 最终，优化所得的模型参数值与它们的真实值较接近。

--- a/chapter_optimization/adagrad-gluon.md
+++ b/chapter_optimization/adagrad-gluon.md
@@ -9,9 +9,8 @@
 %config InlineBackend.figure_format = 'retina'
 %matplotlib inline
 import mxnet as mx
-from mxnet import autograd, gluon, nd
+from mxnet import gluon, nd
 from mxnet.gluon import nn
-import numpy as np
 import sys
 sys.path.append('..')
 import utils
@@ -25,9 +24,9 @@ num_inputs = 2
 num_examples = 1000
 true_w = [2, -3.4]
 true_b = 4.2
-X = nd.random.normal(scale=1, shape=(num_examples, num_inputs))
-y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
-y += nd.random.normal(scale=0.01, shape=y.shape)
+features = nd.random.normal(scale=1, shape=(num_examples, num_inputs))
+labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
+labels += nd.random.normal(scale=0.01, shape=labels.shape)

 # 线性回归模型。
 net = nn.Sequential()
@@ -41,7 +40,7 @@ net.initialize(mx.init.Normal(sigma=1), force_reinit=True)
 trainer = gluon.Trainer(net.collect_params(), 'adagrad',
                        {'learning_rate': 0.9})
 utils.optimize(batch_size=10, trainer=trainer, num_epochs=3, decay_epoch=None,
-               log_interval=10, X=X, y=y, net=net)
+               log_interval=10, features=features, labels=labels, net=net)
 ```

 ## 小结

--- a/chapter_optimization/adagrad-scratch.md
+++ b/chapter_optimization/adagrad-scratch.md
@@ -75,9 +75,9 @@ num_inputs = 2
 num_examples = 1000
 true_w = [2, -3.4]
 true_b = 4.2
-X = nd.random.normal(scale=1, shape=(num_examples, num_inputs))
-y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
-y += nd.random.normal(scale=0.01, shape=y.shape)
+features = nd.random.normal(scale=1, shape=(num_examples, num_inputs))
+labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
+labels += nd.random.normal(scale=0.01, shape=labels.shape)

 # 初始化模型参数。
 def init_params():
@@ -96,24 +96,23 @@ def init_params():

 ```{.python .input  n=3}
 net = utils.linreg
-squared_loss = utils.squared_loss
+loss = utils.squared_loss

 def optimize(batch_size, lr, num_epochs, log_interval):
    [w, b], sqrs = init_params()
-    y_vals = [squared_loss(net(X, w, b), y).mean().asnumpy()]
+    ls = [loss(net(features, w, b), labels).mean().asnumpy()]
    for epoch in range(1, num_epochs + 1):
-        for batch_i, (features, label) in enumerate(
-            utils.data_iter(batch_size, num_examples, X, y)):
+        for batch_i, (X, y) in enumerate(
+            utils.data_iter(batch_size, num_examples, features, labels)):
            with autograd.record():
-                output = net(features, w, b)
-                loss = squared_loss(output, label)
-            loss.backward()
+                l = loss(net(X, w, b), y)
+            l.backward()
            adagrad([w, b], sqrs, lr, batch_size)
            if batch_i * batch_size % log_interval == 0:
-                y_vals.append(squared_loss(net(X, w, b), y).mean().asnumpy())
+                ls.append(loss(net(features, w, b), labels).mean().asnumpy())
    print('w:', w, '\nb:', b, '\n')
-    x_vals = np.linspace(0, num_epochs, len(y_vals), endpoint=True)
-    utils.semilogy(x_vals, y_vals, 'epoch', 'loss')
+    es = np.linspace(0, num_epochs, len(ls), endpoint=True)
+    utils.semilogy(es, ls, 'epoch', 'loss')
 ```

 最终，优化所得的模型参数值与它们的真实值较接近。

--- a/chapter_optimization/adam-gluon.md
+++ b/chapter_optimization/adam-gluon.md
@@ -8,9 +8,8 @@
 %config InlineBackend.figure_format = 'retina'
 %matplotlib inline
 import mxnet as mx
-from mxnet import autograd, gluon, nd
+from mxnet import gluon, nd
 from mxnet.gluon import nn
-import numpy as np
 import sys
 sys.path.append('..')
 import utils
@@ -24,9 +23,9 @@ num_inputs = 2
 num_examples = 1000
 true_w = [2, -3.4]
 true_b = 4.2
-X = nd.random.normal(scale=1, shape=(num_examples, num_inputs))
-y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
-y += nd.random.normal(scale=0.01, shape=y.shape)
+features = nd.random.normal(scale=1, shape=(num_examples, num_inputs))
+labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
+labels += nd.random.normal(scale=0.01, shape=labels.shape)

 # 线性回归模型。
 net = nn.Sequential()
@@ -39,7 +38,7 @@ net.add(nn.Dense(1))
 net.initialize(mx.init.Normal(sigma=1), force_reinit=True)
 trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': 0.1})
 utils.optimize(batch_size=10, trainer=trainer, num_epochs=3, decay_epoch=None,
-               log_interval=10, X=X, y=y, net=net)
+               log_interval=10, features=features, labels=labels, net=net)
 ```

 ## 小结

--- a/chapter_optimization/adam-scratch.md
+++ b/chapter_optimization/adam-scratch.md
@@ -84,9 +84,9 @@ num_inputs = 2
 num_examples = 1000
 true_w = [2, -3.4]
 true_b = 4.2
-X = nd.random.normal(scale=1, shape=(num_examples, num_inputs))
-y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
-y += nd.random.normal(scale=0.01, shape=y.shape)
+features = nd.random.normal(scale=1, shape=(num_examples, num_inputs))
+labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
+labels += nd.random.normal(scale=0.01, shape=labels.shape)

 # 初始化模型参数。
 def init_params():
@@ -107,27 +107,26 @@ def init_params():

 ```{.python .input  n=2}
 net = utils.linreg
-squared_loss = utils.squared_loss
+loss = utils.squared_loss

 def optimize(batch_size, lr, num_epochs, log_interval):
    [w, b], vs, sqrs = init_params()
-    y_vals = [squared_loss(net(X, w, b), y).mean().asnumpy()]
+    ls = [loss(net(features, w, b), labels).mean().asnumpy()]
    t = 0
    for epoch in range(1, num_epochs + 1):
-        for batch_i, (features, label) in enumerate(
-            utils.data_iter(batch_size, num_examples, X, y)):
+        for batch_i, (X, y) in enumerate(
+            utils.data_iter(batch_size, num_examples, features, labels)):
            with autograd.record():
-                output = net(features, w, b)
-                loss = squared_loss(output, label)
-            loss.backward()
+                l = loss(net(X, w, b), y)
+            l.backward()
            # 必须在调用Adam前。
            t += 1
            adam([w, b], vs, sqrs, lr, batch_size, t)
            if batch_i * batch_size % log_interval == 0:
-                y_vals.append(squared_loss(net(X, w, b), y).mean().asnumpy())
+                ls.append(loss(net(features, w, b), labels).mean().asnumpy())
    print('w:', w, '\nb:', b, '\n')
-    x_vals = np.linspace(0, num_epochs, len(y_vals), endpoint=True)
-    utils.semilogy(x_vals, y_vals, 'epoch', 'loss')
+    es = np.linspace(0, num_epochs, len(ls), endpoint=True)
+    utils.semilogy(es, ls, 'epoch', 'loss')
 ```

 最终，优化所得的模型参数值与它们的真实值较接近。

--- a/chapter_optimization/gd-sgd-gluon.md
+++ b/chapter_optimization/gd-sgd-gluon.md
@@ -24,9 +24,9 @@ num_inputs = 2
 num_examples = 1000
 true_w = [2, -3.4]
 true_b = 4.2
-X = nd.random.normal(scale=1, shape=(num_examples, num_inputs))
-y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
-y += nd.random.normal(scale=0.01, shape=y.shape)
+features = nd.random.normal(scale=1, shape=(num_examples, num_inputs))
+labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
+labels += nd.random.normal(scale=0.01, shape=labels.shape)

 # 线性回归模型。
 net = nn.Sequential()
@@ -37,28 +37,27 @@ net.add(nn.Dense(1))

 ```{.python .input  n=2}
 # 优化目标函数。
-def optimize(batch_size, trainer, num_epochs, decay_epoch, log_interval, X, y,
-             net):
-    dataset = gdata.ArrayDataset(X, y)
+def optimize(batch_size, trainer, num_epochs, decay_epoch, log_interval,
+             features, labels, net):
+    dataset = gdata.ArrayDataset(features, labels)
    data_iter = gdata.DataLoader(dataset, batch_size, shuffle=True)
-    square_loss = gloss.L2Loss()
-    y_vals = [square_loss(net(X), y).mean().asnumpy()]
+    loss = gloss.L2Loss()
+    ls = [loss(net(features), labels).mean().asnumpy()]
    for epoch in range(1, num_epochs + 1): 
        # 学习率自我衰减。
        if decay_epoch and epoch > decay_epoch:
            trainer.set_learning_rate(trainer.learning_rate * 0.1)
-        for batch_i, (features, label) in enumerate(data_iter):
+        for batch_i, (X, y) in enumerate(data_iter):
            with autograd.record():
-                output = net(features)
-                loss = square_loss(output, label)
-            loss.backward()
+                l = loss(net(X), y)
+            l.backward()
            trainer.step(batch_size)
            if batch_i * batch_size % log_interval == 0:
-                y_vals.append(square_loss(net(X), y).mean().asnumpy())
+                ls.append(loss(net(features), labels).mean().asnumpy())
    # 为了便于打印，改变输出形状并转化成numpy数组。
    print('w:', net[0].weight.data(), '\nb:', net[0].bias.data(), '\n')
-    x_vals = np.linspace(0, num_epochs, len(y_vals), endpoint=True)
-    utils.semilogy(x_vals, y_vals, 'epoch', 'loss')
+    es = np.linspace(0, num_epochs, len(ls), endpoint=True)
+    utils.semilogy(es, ls, 'epoch', 'loss')
 ```

 以下几组实验分别重现了["梯度下降和随机梯度下降——从零开始"](gd-sgd-scratch.md)一节中实验结果。
@@ -67,35 +66,35 @@ def optimize(batch_size, trainer, num_epochs, decay_epoch, log_interval, X, y,
 net.initialize(mx.init.Normal(sigma=1), force_reinit=True)
 trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.2})
 optimize(batch_size=1, trainer=trainer, num_epochs=3, decay_epoch=2,
-         log_interval=10, X=X, y=y, net=net)
+         log_interval=10, features=features, labels=labels, net=net)
 ```

 ```{.python .input  n=4}
 net.initialize(mx.init.Normal(sigma=1), force_reinit=True)
 trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.999})
 optimize(batch_size=1000, trainer=trainer, num_epochs=3, decay_epoch=None,
-         log_interval=1000, X=X, y=y, net=net)
+         log_interval=1000, features=features, labels=labels, net=net)
 ```

 ```{.python .input  n=5}
 net.initialize(mx.init.Normal(sigma=1), force_reinit=True)
 trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.2})
 optimize(batch_size=10, trainer=trainer, num_epochs=3, decay_epoch=2,
-         log_interval=10, X=X, y=y, net=net)
+         log_interval=10, features=features, labels=labels, net=net)
 ```

 ```{.python .input  n=6}
 net.initialize(mx.init.Normal(sigma=1), force_reinit=True)
 trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 5})
 optimize(batch_size=10, trainer=trainer, num_epochs=3, decay_epoch=2,
-         log_interval=10, X=X, y=y, net=net)
+         log_interval=10, features=features, labels=labels, net=net)
 ```

 ```{.python .input  n=7}
 net.initialize(mx.init.Normal(sigma=1), force_reinit=True)
 trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.002})
 optimize(batch_size=10, trainer=trainer, num_epochs=3, decay_epoch=2,
-         log_interval=10, X=X, y=y, net=net)
+         log_interval=10, features=features, labels=labels, net=net)
 ```

 ## 小结

--- a/chapter_optimization/gd-sgd-scratch.md
+++ b/chapter_optimization/gd-sgd-scratch.md
@@ -134,9 +134,9 @@ num_inputs = 2
 num_examples = 1000
 true_w = [2, -3.4]
 true_b = 4.2
-X = nd.random.normal(scale=1, shape=(num_examples, num_inputs))
-y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
-y += nd.random.normal(scale=0.01, shape=y.shape)
+features = nd.random.normal(scale=1, shape=(num_examples, num_inputs))
+labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
+labels += nd.random.normal(scale=0.01, shape=labels.shape)

 # 初始化模型参数。
 def init_params():
@@ -152,16 +152,16 @@ def linreg(X, w, b):
    return nd.dot(X, w) + b 

 # 平方损失函数。
-def squared_loss(yhat, y): 
-    return (yhat - y.reshape(yhat.shape)) ** 2 / 2
+def squared_loss(y_hat, y): 
+    return (y_hat - y.reshape(y_hat.shape)) ** 2 / 2

 # 遍历数据集。
-def data_iter(batch_size, num_examples, X, y): 
-    idx = list(range(num_examples))
-    random.shuffle(idx)
+def data_iter(batch_size, num_examples, features, labels): 
+    indices = list(range(num_examples))
+    random.shuffle(indices)
    for i in range(0, num_examples, batch_size):
-        j = nd.array(idx[i: min(i + batch_size, num_examples)])
-        yield X.take(j), y.take(j)
+        j = nd.array(indices[i: min(i + batch_size, num_examples)])
+        yield features.take(j), labels.take(j)
 ```

 下面我们描述一下优化函数`optimize`。
@@ -172,26 +172,26 @@ def data_iter(batch_size, num_examples, X, y):

 ```{.python .input  n=3}
 net = linreg
+loss = squared_loss

 def optimize(batch_size, lr, num_epochs, log_interval, decay_epoch):
    w, b = init_params()
-    y_vals = [squared_loss(net(X, w, b), y).mean().asnumpy()]
+    ls = [loss(net(features, w, b), labels).mean().asnumpy()]
    for epoch in range(1, num_epochs + 1):
        # 学习率自我衰减。
        if decay_epoch and epoch > decay_epoch:
            lr *= 0.1
-        for batch_i, (features, label) in enumerate(
-            data_iter(batch_size, num_examples, X, y)):
+        for batch_i, (X, y) in enumerate(
+            data_iter(batch_size, num_examples, features, labels)):
            with autograd.record():
-                output = net(features, w, b)
-                loss = squared_loss(output, label)
-            loss.backward()
+                l = loss(net(X, w, b), y)
+            l.backward()
            sgd([w, b], lr, batch_size)
            if batch_i * batch_size % log_interval == 0:
-                y_vals.append(squared_loss(net(X, w, b), y).mean().asnumpy())
+                ls.append(loss(net(features, w, b), labels).mean().asnumpy())
    print('w:', w, '\nb:', b, '\n')
-    x_vals = np.linspace(0, num_epochs, len(y_vals), endpoint=True)
-    utils.semilogy(x_vals, y_vals, 'epoch', 'loss')
+    es = np.linspace(0, num_epochs, len(ls), endpoint=True)
+    utils.semilogy(es, ls, 'epoch', 'loss')
 ```

 当批量大小为1时，优化使用的是随机梯度下降。在当前学习率下，损失函数值在早期快速下降后略有波动。这是由于随机梯度的方差在迭代过程中无法减小。当迭代周期大于2，学习率自我衰减后，损失函数值下降后较平稳。最终，优化所得的模型参数值`w`和`b`与它们的真实值[2, -3.4]和4.2较接近。

--- a/chapter_optimization/momentum-gluon.md
+++ b/chapter_optimization/momentum-gluon.md
@@ -8,9 +8,8 @@
 %config InlineBackend.figure_format = 'retina'
 %matplotlib inline
 import mxnet as mx
-from mxnet import autograd, gluon, nd
+from mxnet import gluon, nd
 from mxnet.gluon import nn
-import numpy as np
 import sys
 sys.path.append('..')
 import utils
@@ -24,9 +23,9 @@ num_inputs = 2
 num_examples = 1000
 true_w = [2, -3.4]
 true_b = 4.2
-X = nd.random.normal(scale=1, shape=(num_examples, num_inputs))
-y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
-y += nd.random.normal(scale=0.01, shape=y.shape)
+features = nd.random.normal(scale=1, shape=(num_examples, num_inputs))
+labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
+labels += nd.random.normal(scale=0.01, shape=labels.shape)

 # 线性回归模型。
 net = nn.Sequential()
@@ -40,7 +39,7 @@ net.initialize(mx.init.Normal(sigma=1), force_reinit=True)
 trainer = gluon.Trainer(net.collect_params(), 'sgd',
                        {'learning_rate': 0.2, 'momentum': 0.99})
 utils.optimize(batch_size=10, trainer=trainer, num_epochs=3, decay_epoch=2,
-               log_interval=10, X=X, y=y, net=net)
+               log_interval=10, features=features, labels=labels, net=net)
 ```

 ```{.python .input}
@@ -48,7 +47,7 @@ net.initialize(mx.init.Normal(sigma=1), force_reinit=True)
 trainer = gluon.Trainer(net.collect_params(), 'sgd',
                        {'learning_rate': 0.2, 'momentum': 0.9})
 utils.optimize(batch_size=10, trainer=trainer, num_epochs=3, decay_epoch=2,
-               log_interval=10, X=X, y=y, net=net)
+               log_interval=10, features=features, labels=labels, net=net)
 ```

 ```{.python .input}
@@ -56,7 +55,7 @@ net.initialize(mx.init.Normal(sigma=1), force_reinit=True)
 trainer = gluon.Trainer(net.collect_params(), 'sgd',
                        {'learning_rate': 0.2, 'momentum': 0.5})
 utils.optimize(batch_size=10, trainer=trainer, num_epochs=3, decay_epoch=2,
-               log_interval=10, X=X, y=y, net=net)
+               log_interval=10, features=features, labels=labels, net=net)
 ```

 ## 小结

--- a/chapter_optimization/momentum-scratch.md
+++ b/chapter_optimization/momentum-scratch.md
@@ -101,9 +101,9 @@ num_inputs = 2
 num_examples = 1000
 true_w = [2, -3.4]
 true_b = 4.2
-X = nd.random.normal(scale=1, shape=(num_examples, num_inputs))
-y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
-y += nd.random.normal(scale=0.01, shape=y.shape)
+features = nd.random.normal(scale=1, shape=(num_examples, num_inputs))
+labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
+labels += nd.random.normal(scale=0.01, shape=labels.shape)

 # 初始化模型参数。
 def init_params():
@@ -122,27 +122,26 @@ def init_params():

 ```{.python .input  n=3}
 net = utils.linreg
-squared_loss = utils.squared_loss
+loss = utils.squared_loss

 def optimize(batch_size, lr, mom, num_epochs, log_interval):
    [w, b], vs = init_params()
-    y_vals = [squared_loss(net(X, w, b), y).mean().asnumpy()]
+    ls = [loss(net(features, w, b), labels).mean().asnumpy()]
    for epoch in range(1, num_epochs + 1):
        # 学习率自我衰减。
        if epoch > 2:
            lr *= 0.1
-        for batch_i, (features, label) in enumerate(
-            utils.data_iter(batch_size, num_examples, X, y)):
+        for batch_i, (X, y) in enumerate(
+            utils.data_iter(batch_size, num_examples, features, labels)):
            with autograd.record():
-                output = net(features, w, b)
-                loss = squared_loss(output, label)
-            loss.backward()
+                l = loss(net(X, w, b), y)
+            l.backward()
            sgd_momentum([w, b], vs, lr, mom, batch_size)
            if batch_i * batch_size % log_interval == 0:
-                y_vals.append(squared_loss(net(X, w, b), y).mean().asnumpy())
+                ls.append(loss(net(features, w, b), labels).mean().asnumpy())
    print('w:', w, '\nb:', b, '\n')
-    x_vals = np.linspace(0, num_epochs, len(y_vals), endpoint=True)
-    utils.semilogy(x_vals, y_vals, 'epoch', 'loss')
+    es = np.linspace(0, num_epochs, len(ls), endpoint=True)
+    utils.semilogy(es, ls, 'epoch', 'loss')
 ```

 我们先将动量超参数$\gamma$（`mom`）设0.99。此时，小梯度随机梯度下降可被看作使用了特殊梯度：这个特殊梯度是最近100个时刻的$100\nabla f_\mathcal{B}(\boldsymbol{x})$的加权平均。我们观察到，损失函数值在3个迭代周期后上升。这很可能是由于特殊梯度中较大的系数100造成的。

--- a/chapter_optimization/rmsprop-gluon.md
+++ b/chapter_optimization/rmsprop-gluon.md
@@ -9,9 +9,8 @@
 %config InlineBackend.figure_format = 'retina'
 %matplotlib inline
 import mxnet as mx
-from mxnet import autograd, gluon, nd
+from mxnet import gluon, nd
 from mxnet.gluon import nn
-import numpy as np
 import sys
 sys.path.append('..')
 import utils
@@ -25,9 +24,9 @@ num_inputs = 2
 num_examples = 1000
 true_w = [2, -3.4]
 true_b = 4.2
-X = nd.random.normal(scale=1, shape=(num_examples, num_inputs))
-y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
-y += nd.random.normal(scale=0.01, shape=y.shape)
+features = nd.random.normal(scale=1, shape=(num_examples, num_inputs))
+labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
+labels += nd.random.normal(scale=0.01, shape=labels.shape)

 # 线性回归模型。
 net = nn.Sequential()
@@ -41,7 +40,7 @@ net.initialize(mx.init.Normal(sigma=1), force_reinit=True)
 trainer = gluon.Trainer(net.collect_params(), 'rmsprop',
                        {'learning_rate': 0.03, 'gamma1': 0.9})
 utils.optimize(batch_size=10, trainer=trainer, num_epochs=3, decay_epoch=None,
-               log_interval=10, X=X, y=y, net=net)
+               log_interval=10, features=features, labels=labels, net=net)
 ```

 ```{.python .input}
@@ -49,7 +48,7 @@ net.collect_params().initialize(mx.init.Normal(sigma=1), force_reinit=True)
 trainer = gluon.Trainer(net.collect_params(), 'rmsprop',
                        {'learning_rate': 0.03, 'gamma1': 0.999})
 utils.optimize(batch_size=10, trainer=trainer, num_epochs=3, decay_epoch=None,
-               log_interval=10, X=X, y=y, net=net)
+               log_interval=10, features=features, labels=labels, net=net)
 ```

 ## 小结

--- a/chapter_optimization/rmsprop-scratch.md
+++ b/chapter_optimization/rmsprop-scratch.md
@@ -64,9 +64,9 @@ num_inputs = 2
 num_examples = 1000
 true_w = [2, -3.4]
 true_b = 4.2
-X = nd.random.normal(scale=1, shape=(num_examples, num_inputs))
-y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
-y += nd.random.normal(scale=0.01, shape=y.shape)
+features = nd.random.normal(scale=1, shape=(num_examples, num_inputs))
+labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
+labels += nd.random.normal(scale=0.01, shape=labels.shape)

 # 初始化模型参数。
 def init_params():
@@ -85,24 +85,23 @@ def init_params():

 ```{.python .input  n=2}
 net = utils.linreg
-squared_loss = utils.squared_loss
+loss = utils.squared_loss

 def optimize(batch_size, lr, gamma, num_epochs, log_interval):
    [w, b], sqrs = init_params()
-    y_vals = [squared_loss(net(X, w, b), y).mean().asnumpy()]
+    ls = [loss(net(features, w, b), labels).mean().asnumpy()]
    for epoch in range(1, num_epochs + 1):
-        for batch_i, (features, label) in enumerate(
-            utils.data_iter(batch_size, num_examples, X, y)):
+        for batch_i, (X, y) in enumerate(
+            utils.data_iter(batch_size, num_examples, features, labels)):
            with autograd.record():
-                output = net(features, w, b)
-                loss = squared_loss(output, label)
-            loss.backward()
+                l = loss(net(X, w, b), y)
+            l.backward()
            rmsprop([w, b], sqrs, lr, gamma, batch_size)
            if batch_i * batch_size % log_interval == 0:
-                y_vals.append(squared_loss(net(X, w, b), y).mean().asnumpy())
+                ls.append(loss(net(features, w, b), labels).mean().asnumpy())
    print('w:', w, '\nb:', b, '\n')
-    x_vals = np.linspace(0, num_epochs, len(y_vals), endpoint=True)
-    utils.semilogy(x_vals, y_vals, 'epoch', 'loss')
+    es = np.linspace(0, num_epochs, len(ls), endpoint=True)
+    utils.semilogy(es, ls, 'epoch', 'loss')
 ```

 我们将初始学习率设为0.03，并将$\gamma$（`gamma`）设为0.9。此时，变量$\boldsymbol{s}$可看作是最近$1/(1-0.9) = 10$个时刻的平方项$\boldsymbol{g} \odot \boldsymbol{g}$的加权平均。我们观察到，损失函数在迭代后期较震荡。

--- a/img/gd_and_overshooting.svg
+++ b/img/gd_and_overshooting.svg
 <?xml version="1.0" encoding="UTF-8" standalone="no"?>
 <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
-<svg xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:xl="http://www.w3.org/1999/xlink" version="1.1" xmlns="http://www.w3.org/2000/svg" viewBox="5 35 380 165" width="380" height="165">
+<svg xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:xl="http://www.w3.org/1999/xlink" version="1.1" xmlns="http://www.w3.org/2000/svg" viewBox="7 35 378 161" width="378" height="161">
  <defs>
    <marker orient="auto" overflow="visible" markerUnits="strokeWidth" id="StickArrow_Marker" stroke-linejoin="miter" stroke-miterlimit="10" viewBox="-1 -4 10 8" markerWidth="10" markerHeight="8" color="black">
      <g>
        <path d="M 8 0 L 0 0 M 0 -3 L 8 0 L 0 3" fill="none" stroke="currentColor" stroke-width="1"/>
      </g>
    </marker>
-    <font-face font-family="Arial" font-size="12" panose-1="2 11 6 4 2 2 2 9 2 4" units-per-em="1000" underline-position="-105.95703" underline-thickness="73.24219" slope="-1e3" x-height="518.5547" cap-height="715.8203" ascent="905.2734" descent="-211.91406" font-style="italic" font-weight="400">
+    <font-face font-family="Arial" font-size="9" panose-1="2 11 6 4 2 2 2 9 2 4" units-per-em="1000" underline-position="-105.95703" underline-thickness="73.24219" slope="-1333.3333" x-height="518.5547" cap-height="715.8203" ascent="905.2734" descent="-211.91406" font-style="italic" font-weight="400">
      <font-face-src>
        <font-face-name name="Arial-ItalicMT"/>
      </font-face-src>
    </font-face>
-    <marker orient="auto" overflow="visible" markerUnits="strokeWidth" id="FilledArrow_Marker" stroke-linejoin="miter" stroke-miterlimit="10" viewBox="-1 -4 8 8" markerWidth="8" markerHeight="8" color="#5b7daa">
+    <marker orient="auto" overflow="visible" markerUnits="strokeWidth" id="FilledArrow_Marker" stroke-linejoin="miter" stroke-miterlimit="10" viewBox="-1 -4 10 8" markerWidth="10" markerHeight="8" color="#5b7daa">
      <g>
-        <path d="M 5.866667 0 L 0 -2.2 L 0 2.2 Z" fill="currentColor" stroke="currentColor" stroke-width="1"/>
+        <path d="M 8 0 L 0 -3 L 0 3 Z" fill="currentColor" stroke="currentColor" stroke-width="1"/>
      </g>
    </marker>
-    <marker orient="auto" overflow="visible" markerUnits="strokeWidth" id="Ball_Marker" stroke-linejoin="miter" stroke-miterlimit="10" viewBox="-6 -4 7 8" markerWidth="7" markerHeight="8" color="#5b7daa">
+    <marker orient="auto" overflow="visible" markerUnits="strokeWidth" id="Ball_Marker" stroke-linejoin="miter" stroke-miterlimit="10" viewBox="-7 -4 8 8" markerWidth="8" markerHeight="8" color="#5b7daa">
      <g>
-        <circle cx="-2.199999" cy="0" r="2.1999980557011" fill="none" stroke="currentColor" stroke-width="1"/>
+        <circle cx="-2.9999986" cy="0" r="2.99999734868332" fill="none" stroke="currentColor" stroke-width="1"/>
      </g>
    </marker>
  </defs>
  <metadata> Produced by OmniGraffle 7.7.1 
-    <dc:date>2018-04-01 20:05:43 +0000</dc:date>
+    <dc:date>2018-05-13 01:22:49 +0000</dc:date>
  </metadata>
  <g id="Canvas_1" fill-opacity="1" stroke-dasharray="none" stroke="none" stroke-opacity="1" fill="none">
    <title>Canvas 1</title>
@@ -37,26 +37,26 @@
        <line x1="35" y1="173" x2="173.12344" y2="173" marker-end="url(#StickArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
      </g>
      <g id="Graphic_17">
-        <text transform="translate(10 41.692383)" fill="black">
-          <tspan font-family="Arial" font-size="12" font-style="italic" font-weight="400" fill="black" x=".33691406" y="11">f(x)</tspan>
+        <text transform="translate(12.5 41.741455)" fill="black">
+          <tspan font-family="Arial" font-size="9" font-style="italic" font-weight="400" fill="black" x=".002685547" y="8">f(x)</tspan>
        </text>
      </g>
      <g id="Graphic_16">
-        <text transform="translate(170.5 179.80371)" fill="black">
-          <tspan font-family="Arial" font-size="12" font-style="italic" font-weight="400" fill="black" x="0" y="11">x</tspan>
+        <text transform="translate(171 179.85278)" fill="black">
+          <tspan font-family="Arial" font-size="9" font-style="italic" font-weight="400" fill="black" x=".25" y="8">x</tspan>
        </text>
      </g>
      <g id="Line_15">
-        <path d="M 58.5 60 C 58.5 60 79.024875 156.25132 106.09766 156.07812 C 133.17044 155.90493 154.06055 59.38867 154.06055 59.38867" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1.5"/>
+        <path d="M 58.5 60 C 58.5 60 79.024875 156.25132 106.09766 156.07812 C 133.17044 155.90493 154.06055 59.38867 154.06055 59.38867" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
      <g id="Line_14">
-        <line x1="61.74284" y1="72.97964" x2="64.23549" y2="82.310445" marker-end="url(#FilledArrow_Marker)" marker-start="url(#Ball_Marker)" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1.5"/>
+        <line x1="61.523466" y1="72.15844" x2="64.62263" y2="83.75963" marker-end="url(#FilledArrow_Marker)" marker-start="url(#Ball_Marker)" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
      <g id="Line_13">
-        <line x1="69.28829" y1="99.12173" x2="70.72686" y2="103.38009" marker-end="url(#FilledArrow_Marker)" marker-start="url(#Ball_Marker)" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1.5"/>
+        <line x1="69.01625" y1="98.31644" x2="71.20694" y2="104.80119" marker-end="url(#FilledArrow_Marker)" marker-start="url(#Ball_Marker)" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
      <g id="Line_12">
-        <line x1="76.51754" y1="118.90492" x2="77.91111" y2="122.14559" marker-end="url(#FilledArrow_Marker)" marker-start="url(#Ball_Marker)" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1.5"/>
+        <line x1="76.18175" y1="118.12405" x2="78.50369" y2="123.52359" marker-end="url(#FilledArrow_Marker)" marker-start="url(#Ball_Marker)" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
      <g id="Line_11">
        <line x1="235.5" y1="172.5" x2="235.5" y2="46.28867" marker-end="url(#StickArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
@@ -65,29 +65,29 @@
        <line x1="235.5" y1="173" x2="373.62344" y2="173" marker-end="url(#StickArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
      </g>
      <g id="Graphic_9">
-        <text transform="translate(210.5 41.692383)" fill="black">
-          <tspan font-family="Arial" font-size="12" font-style="italic" font-weight="400" fill="black" x=".33691406" y="11">f(x)</tspan>
+        <text transform="translate(213 41.741455)" fill="black">
+          <tspan font-family="Arial" font-size="9" font-style="italic" font-weight="400" fill="black" x=".002685547" y="8">f(x)</tspan>
        </text>
      </g>
      <g id="Graphic_8">
-        <text transform="translate(371 179.80371)" fill="black">
-          <tspan font-family="Arial" font-size="12" font-style="italic" font-weight="400" fill="black" x="0" y="11">x</tspan>
+        <text transform="translate(371.5 179.85278)" fill="black">
+          <tspan font-family="Arial" font-size="9" font-style="italic" font-weight="400" fill="black" x=".25" y="8">x</tspan>
        </text>
      </g>
      <g id="Line_7">
-        <path d="M 259 60 C 259 60 279.52488 156.25132 306.59766 156.07812 C 333.67044 155.90493 354.56055 59.38867 354.56055 59.38867" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1.5"/>
+        <path d="M 259 60 C 259 60 279.52488 156.25132 306.59766 156.07812 C 333.67044 155.90493 354.56055 59.38867 354.56055 59.38867" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
      <g id="Line_6">
-        <line x1="298.55188" y1="144.89358" x2="317.14566" y2="141.40002" marker-end="url(#FilledArrow_Marker)" marker-start="url(#Ball_Marker)" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1.5"/>
+        <line x1="297.7165" y1="145.05053" x2="318.61986" y2="141.12303" marker-end="url(#FilledArrow_Marker)" marker-start="url(#Ball_Marker)" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
      <g id="Line_5">
-        <line x1="320.11955" y1="136.01634" x2="287.00124" y2="125.30652" marker-end="url(#FilledArrow_Marker)" marker-start="url(#Ball_Marker)" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1.5"/>
+        <line x1="320.9283" y1="136.27788" x2="285.574" y2="124.84498" marker-end="url(#FilledArrow_Marker)" marker-start="url(#Ball_Marker)" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
      <g id="Line_4">
-        <line x1="285.25747" y1="120.53153" x2="331.5463" y2="109.44607" marker-end="url(#FilledArrow_Marker)" marker-start="url(#Ball_Marker)" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1.5"/>
+        <line x1="284.43085" y1="120.7295" x2="333.00505" y2="109.09672" marker-end="url(#FilledArrow_Marker)" marker-start="url(#Ball_Marker)" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
      <g id="Line_3">
-        <line x1="334.1477" y1="104.24621" x2="274.7326" y2="84.50641" marker-end="url(#FilledArrow_Marker)" marker-start="url(#Ball_Marker)" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1.5"/>
+        <line x1="334.95433" y1="104.5142" x2="273.30912" y2="84.03348" marker-end="url(#FilledArrow_Marker)" marker-start="url(#Ball_Marker)" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
    </g>
  </g>

--- a/img/momentum-move.svg
+++ b/img/momentum-move.svg
 <?xml version="1.0" encoding="UTF-8" standalone="no"?>
 <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
-<svg xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:xl="http://www.w3.org/1999/xlink" version="1.1" xmlns="http://www.w3.org/2000/svg" viewBox="6.5 5 306.14414 158.5" width="306.14414" height="158.5">
+<svg xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:xl="http://www.w3.org/1999/xlink" version="1.1" xmlns="http://www.w3.org/2000/svg" viewBox="7.5 5 305.14414 153.5" width="305.14414" height="153.5">
  <defs>
-    <marker orient="auto" overflow="visible" markerUnits="strokeWidth" id="StickArrow_Marker" stroke-linejoin="miter" stroke-miterlimit="10" viewBox="-1 -4 8 8" markerWidth="8" markerHeight="8" color="#5b7daa">
+    <marker orient="auto" overflow="visible" markerUnits="strokeWidth" id="StickArrow_Marker" stroke-linejoin="miter" stroke-miterlimit="10" viewBox="-1 -4 10 8" markerWidth="10" markerHeight="8" color="#5b7daa">
      <g>
-        <path d="M 5.866667 0 L 0 0 M 0 -2.2 L 5.866667 0 L 0 2.2" fill="none" stroke="currentColor" stroke-width="1"/>
+        <path d="M 8 0 L 0 0 M 0 -3 L 8 0 L 0 3" fill="none" stroke="currentColor" stroke-width="1"/>
      </g>
    </marker>
    <marker orient="auto" overflow="visible" markerUnits="strokeWidth" id="StickArrow_Marker_2" stroke-linejoin="miter" stroke-miterlimit="10" viewBox="-1 -4 10 8" markerWidth="10" markerHeight="8" color="black">
@@ -12,95 +12,95 @@
        <path d="M 8 0 L 0 0 M 0 -3 L 8 0 L 0 3" fill="none" stroke="currentColor" stroke-width="1"/>
      </g>
    </marker>
-    <font-face font-family="Arial" font-size="12" panose-1="2 11 6 4 2 2 2 9 2 4" units-per-em="1000" underline-position="-105.95703" underline-thickness="73.24219" slope="-1e3" x-height="518.5547" cap-height="715.8203" ascent="905.2734" descent="-211.91406" font-style="italic" font-weight="400">
+    <font-face font-family="Arial" font-size="9" panose-1="2 11 6 4 2 2 2 9 2 4" units-per-em="1000" underline-position="-105.95703" underline-thickness="73.24219" slope="-1333.3333" x-height="518.5547" cap-height="715.8203" ascent="905.2734" descent="-211.91406" font-style="italic" font-weight="400">
      <font-face-src>
        <font-face-name name="Arial-ItalicMT"/>
      </font-face-src>
    </font-face>
-    <font-face font-family="Arial" font-size="8" panose-1="2 11 6 4 2 2 2 2 2 4" units-per-em="1000" underline-position="-105.95703" underline-thickness="73.24219" slope="0" x-height="518.5547" cap-height="716.3086" ascent="905.2734" descent="-211.91406" font-weight="400">
+    <font-face font-family="Arial" font-size="7" panose-1="2 11 6 4 2 2 2 2 2 4" units-per-em="1000" underline-position="-105.95703" underline-thickness="73.24219" slope="0" x-height="518.5547" cap-height="716.3086" ascent="905.2734" descent="-211.91406" font-weight="400">
      <font-face-src>
        <font-face-name name="ArialMT"/>
      </font-face-src>
    </font-face>
  </defs>
  <metadata> Produced by OmniGraffle 7.7.1 
-    <dc:date>2018-04-05 18:37:56 +0000</dc:date>
+    <dc:date>2018-05-13 01:23:28 +0000</dc:date>
  </metadata>
  <g id="Canvas_1" fill-opacity="1" stroke-dasharray="none" stroke="none" stroke-opacity="1" fill="none">
    <title>Canvas 1</title>
    <g id="Canvas_1: Layer 1">
      <title>Layer 1</title>
-      <g id="Graphic_14">
+      <g id="Graphic_21">
        <ellipse cx="163.69034" cy="72.75" rx="123.690538554121" ry="51.7500826914231" fill="white"/>
        <ellipse cx="163.69034" cy="72.75" rx="123.690538554121" ry="51.7500826914231" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
-      <g id="Graphic_13">
+      <g id="Graphic_20">
        <ellipse cx="163.69034" cy="72.75" rx="67.8239720122439" ry="21.5625344547596" fill="white"/>
        <ellipse cx="163.69034" cy="72.75" rx="67.8239720122439" ry="21.5625344547596" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
-      <g id="Graphic_12">
+      <g id="Graphic_19">
        <ellipse cx="164.86648" cy="72.84801" rx="15.877866280323" ry="5.19461057319209" fill="white"/>
        <ellipse cx="164.86648" cy="72.84801" rx="15.877866280323" ry="5.19461057319209" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
-      <g id="Line_11">
-        <line x1="64.50284" y1="76.627575" x2="69.95741" y2="61.879636" marker-end="url(#StickArrow_Marker)" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1.5"/>
+      <g id="Line_18">
+        <line x1="64.50284" y1="76.627575" x2="70.47774" y2="60.472776" marker-end="url(#StickArrow_Marker)" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
-      <g id="Line_9">
-        <path d="M 74.107955 52.167614 L 75.546875 67.35156 L 77.24887 79.1592" marker-end="url(#StickArrow_Marker)" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1.5"/>
+      <g id="Line_17">
+        <path d="M 74.107955 52.167614 L 75.546875 67.35156 L 77.46287 80.64385" marker-end="url(#StickArrow_Marker)" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
-      <g id="Line_8">
-        <line x1="78.60427" y1="88.56241" x2="89.16032" y2="64.82744" marker-end="url(#StickArrow_Marker)" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1.5"/>
+      <g id="Line_16">
+        <line x1="78.60427" y1="88.56241" x2="89.76987" y2="63.456875" marker-end="url(#StickArrow_Marker)" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
-      <g id="Line_7">
-        <line x1="93.79292" y1="54.411155" x2="99.21977" y2="70.88387" marker-end="url(#StickArrow_Marker)" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1.5"/>
+      <g id="Line_15">
+        <line x1="93.79292" y1="54.411155" x2="99.68927" y2="72.309" marker-end="url(#StickArrow_Marker)" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
-      <g id="Line_6">
-        <line x1="102.53125" y1="81.79231" x2="106.39071" y2="74.86708" marker-end="url(#StickArrow_Marker)" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1.5"/>
+      <g id="Line_14">
+        <line x1="102.53125" y1="81.79231" x2="107.12092" y2="73.55682" marker-end="url(#StickArrow_Marker)" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
-      <g id="Line_5">
-        <line x1="111.94034" y1="65.46653" x2="115.32754" y2="70.911195" marker-end="url(#StickArrow_Marker)" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1.5"/>
+      <g id="Line_13">
+        <line x1="111.94034" y1="65.46653" x2="116.1199" y2="72.18484" marker-end="url(#StickArrow_Marker)" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
-      <g id="Line_4">
-        <line x1="121.34943" y1="79.22334" x2="122.94727" y2="77.52486" marker-end="url(#StickArrow_Marker)" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1.5"/>
+      <g id="Line_12">
+        <line x1="121.34943" y1="79.22334" x2="123.97507" y2="76.43233" marker-end="url(#StickArrow_Marker)" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
-      <g id="Line_3">
-        <line x1="130.75852" y1="69.36708" x2="132.21151" y2="70.85814" marker-end="url(#StickArrow_Marker)" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1.5"/>
+      <g id="Line_11">
+        <line x1="130.75852" y1="69.36708" x2="133.25837" y2="71.93243" marker-end="url(#StickArrow_Marker)" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
-      <g id="Graphic_19">
+      <g id="Graphic_10">
        <circle cx="63.38367" cy="78.88367" r="2.38367080105348" fill="white"/>
-        <circle cx="63.38367" cy="78.88367" r="2.38367080105348" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1.5"/>
+        <circle cx="63.38367" cy="78.88367" r="2.38367080105348" stroke="#5b7daa" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
      </g>
-      <g id="Line_32">
+      <g id="Line_9">
        <line x1="28.5" y1="139.5" x2="292.34414" y2="139.5" marker-end="url(#StickArrow_Marker_2)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
      </g>
-      <g id="Line_33">
+      <g id="Line_8">
        <line x1="28.5" y1="139.5" x2="28.5" y2="16.730078" marker-end="url(#StickArrow_Marker_2)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
      </g>
-      <g id="Graphic_35">
-        <text transform="translate(288.05273 141.86914)" fill="black">
-          <tspan font-family="Arial" font-size="12" font-style="italic" font-weight="400" fill="black" x=".27539062" y="11">x</tspan>
-          <tspan font-family="Arial" font-size="8" font-weight="400" fill="black" y="14">1</tspan>
+      <g id="Graphic_7">
+        <text transform="translate(289.05273 141.8855)" fill="black">
+          <tspan font-family="Arial" font-size="9" font-style="italic" font-weight="400" fill="black" x=".3034668" y="8">x</tspan>
+          <tspan font-family="Arial" font-size="7" font-weight="400" fill="black" y="10">1</tspan>
        </text>
      </g>
-      <g id="Graphic_36">
-        <text transform="translate(11.5 10.369141)" fill="black">
-          <tspan font-family="Arial" font-size="12" font-style="italic" font-weight="400" fill="black" x=".27539062" y="11">x</tspan>
-          <tspan font-family="Arial" font-size="8" font-weight="400" fill="black" y="14">2</tspan>
+      <g id="Graphic_6">
+        <text transform="translate(12.5 10.385498)" fill="black">
+          <tspan font-family="Arial" font-size="9" font-style="italic" font-weight="400" fill="black" x=".3034668" y="8">x</tspan>
+          <tspan font-family="Arial" font-size="7" font-weight="400" fill="black" y="10">2</tspan>
        </text>
      </g>
-      <g id="Graphic_37">
-        <text transform="translate(268.55273 30.80371)" fill="black">
-          <tspan font-family="Arial" font-size="12" font-style="italic" font-weight="400" fill="black" x=".32128906" y="11">f = 10</tspan>
+      <g id="Graphic_5">
+        <text transform="translate(268.8 32.752784)" fill="black">
+          <tspan font-family="Arial" font-size="9" font-style="italic" font-weight="400" fill="black" x=".1159668" y="8">f = 10</tspan>
        </text>
      </g>
-      <g id="Graphic_38">
-        <text transform="translate(183.19602 60.89462)" fill="black">
-          <tspan font-family="Arial" font-size="12" font-style="italic" font-weight="400" fill="black" x=".15820312" y="11">f = 1</tspan>
+      <g id="Graphic_4">
+        <text transform="translate(186.19602 60.94369)" fill="black">
+          <tspan font-family="Arial" font-size="9" font-style="italic" font-weight="400" fill="black" x=".11865234" y="8">f = 1</tspan>
        </text>
      </g>
-      <g id="Graphic_39">
-        <text transform="translate(222.79261 48.30371)" fill="black">
-          <tspan font-family="Arial" font-size="12" font-style="italic" font-weight="400" fill="black" x=".15820312" y="11">f = 5</tspan>
+      <g id="Graphic_3">
+        <text transform="translate(222.6 50.152784)" fill="black">
+          <tspan font-family="Arial" font-size="9" font-style="italic" font-weight="400" fill="black" x=".11865234" y="8">f = 5</tspan>
        </text>
      </g>
    </g>

--- a/utils.py
+++ b/utils.py
@@ -3,7 +3,7 @@ from mxnet import gluon
 from mxnet import autograd
 from mxnet import nd
 from mxnet import image
-from mxnet.gluon import nn
+from mxnet.gluon import nn, data as gdata, loss as gloss
 import mxnet as mx
 import numpy as np
 from time import time
@@ -357,13 +357,13 @@ def set_fig_size(mpl, figsize=(3.5, 2.5)):
    mpl.rcParams['figure.figsize'] = figsize


-def data_iter(batch_size, num_examples, X, y):
+def data_iter(batch_size, num_examples, features, labels): 
    """遍历数据集。"""
-    idx = list(range(num_examples))
-    random.shuffle(idx)
+    indices = list(range(num_examples))
+    random.shuffle(indices)
    for i in range(0, num_examples, batch_size):
-        j = nd.array(idx[i: min(i + batch_size, num_examples)])
-        yield X.take(j), y.take(j)
+        j = nd.array(indices[i: min(i + batch_size, num_examples)])
+        yield features.take(j), labels.take(j)


 def linreg(X, w, b):
@@ -371,33 +371,33 @@ def linreg(X, w, b):
    return nd.dot(X, w) + b


-def squared_loss(yhat, y):
+def squared_loss(y_hat, y):
    """平方损失函数。"""
-    return (yhat - y.reshape(yhat.shape)) ** 2 / 2
+    return (y_hat - y.reshape(y_hat.shape)) ** 2 / 2


-def optimize(batch_size, trainer, num_epochs, decay_epoch, log_interval, X, y,
-             net):
+def optimize(batch_size, trainer, num_epochs, decay_epoch, log_interval,
+             features, labels, net):
    """优化目标函数。"""
-    dataset = gluon.data.ArrayDataset(X, y)
-    data_iter = gluon.data.DataLoader(dataset, batch_size, shuffle=True)
-    square_loss = gluon.loss.L2Loss()
-    y_vals = [square_loss(net(X), y).mean().asnumpy()]
+    dataset = gdata.ArrayDataset(features, labels)
+    data_iter = gdata.DataLoader(dataset, batch_size, shuffle=True)
+    loss = gloss.L2Loss()
+    ls = [loss(net(features), labels).mean().asnumpy()]
    for epoch in range(1, num_epochs + 1): 
        # 学习率自我衰减。
        if decay_epoch and epoch > decay_epoch:
            trainer.set_learning_rate(trainer.learning_rate * 0.1)
-        for batch_i, (features, label) in enumerate(data_iter):
+        for batch_i, (X, y) in enumerate(data_iter):
            with autograd.record():
-                output = net(features)
-                loss = square_loss(output, label)
-            loss.backward()
+                l = loss(net(X), y)
+            l.backward()
            trainer.step(batch_size)
            if batch_i * batch_size % log_interval == 0:
-                y_vals.append(square_loss(net(X), y).mean().asnumpy())
+                ls.append(loss(net(features), labels).mean().asnumpy())
+    # 打印学到的权重和偏差。
    print('w:', net[0].weight.data(), '\nb:', net[0].bias.data(), '\n')
-    x_vals = np.linspace(0, num_epochs, len(y_vals), endpoint=True)
-    semilogy(x_vals, y_vals, 'epoch', 'loss')
+    es = np.linspace(0, num_epochs, len(ls), endpoint=True)
+    semilogy(es, ls, 'epoch', 'loss')


 def semilogy(x_vals, y_vals, x_label, y_label, figsize=(3.5, 2.5)):