refactor opt code

ae48f45b · Aston Zhang · bb73c921 · ae48f45b · ae48f45b · ae48f45b
13 changed files
--- a/chapter_optimization/adadelta-gluon.md
+++ b/chapter_optimization/adadelta-gluon.md
@@ -6,12 +6,12 @@
 import mxnet as mx
 from mxnet import autograd
 from mxnet import gluon
-from mxnet import ndarray as nd
-import numpy as np
+from mxnet import nd
 import random

-mx.random.seed(1)
+# 为方便比较同一优化算法的从零开始实现和Gluon实现，将输出保持确定。
 random.seed(1)
+mx.random.seed(1)

 # 生成数据集。
 num_inputs = 2
@@ -21,56 +21,30 @@ true_b = 4.2
 X = nd.random_normal(scale=1, shape=(num_examples, num_inputs))
 y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
 y += .01 * nd.random_normal(scale=1, shape=y.shape)
-dataset = gluon.data.ArrayDataset(X, y)

+# 创建模型和定义损失函数。
 net = gluon.nn.Sequential()
 net.add(gluon.nn.Dense(1))
-square_loss = gluon.loss.L2Loss()
 ```

 我们需要在`gluon.Trainer`中指定优化算法名称`adadelta`并设置rho参数。

 ```{.python .input  n=2}
 %matplotlib inline
-import matplotlib as mpl
-mpl.rcParams['figure.dpi']= 120
-import matplotlib.pyplot as plt
-
-def train(batch_size, rho, epochs, period):
-    assert period >= batch_size and period % batch_size == 0
-    net.collect_params().initialize(mx.init.Normal(sigma=1), force_reinit=True)
-    # Adadelta。
-    trainer = gluon.Trainer(net.collect_params(), 'adadelta',
-                            {'rho': rho})
-    data_iter = gluon.data.DataLoader(dataset, batch_size, shuffle=True)
-    total_loss = [np.mean(square_loss(net(X), y).asnumpy())]
-    
-    for epoch in range(1, epochs + 1):
-        for batch_i, (data, label) in enumerate(data_iter):
-            with autograd.record():
-                output = net(data)
-                loss = square_loss(output, label)
-            loss.backward()
-            trainer.step(batch_size)
-
-            if batch_i * batch_size % period == 0:
-                total_loss.append(np.mean(square_loss(net(X), y).asnumpy()))
-        print("Batch size %d, Epoch %d, loss %.4e" % 
-              (batch_size, epoch, total_loss[-1]))
-
-    print('w:', np.reshape(net[0].weight.data().asnumpy(), (1, -1)), 
-          'b:', net[0].bias.data().asnumpy()[0], '\n')
-    x_axis = np.linspace(0, epochs, len(total_loss), endpoint=True)
-    plt.semilogy(x_axis, total_loss)
-    plt.xlabel('epoch')
-    plt.ylabel('loss')
-    plt.show()
+%config InlineBackend.figure_format = 'retina'
+import numpy as np
+import sys
+sys.path.append('..')
+import utils
 ```

 使用Adadelta，最终学到的参数值与真实值较接近。

 ```{.python .input  n=3}
-train(batch_size=10, rho=0.9999, epochs=3, period=10)
+net.collect_params().initialize(mx.init.Normal(sigma=1), force_reinit=True)
+trainer = gluon.Trainer(net.collect_params(), 'adadelta', {'rho': 0.9999})
+utils.optimize(batch_size=10, trainer=trainer, num_epochs=3, decay_epoch=None,
+               log_interval=10, X=X, y=y, net=net, print_lr=False)
 ```

 ## 结论

--- a/chapter_optimization/adadelta-scratch.md
+++ b/chapter_optimization/adadelta-scratch.md
@@ -47,12 +47,13 @@ def adadelta(params, sqrs, deltas, rho, batch_size):
 实验中，我们以线性回归为例。其中真实参数`w`为[2, -3.4]，`b`为4.2。我们把算法中基于指数加权移动平均的变量初始化为和参数形状相同的零张量。

 ```{.python .input  n=1}
-from mxnet import ndarray as nd
 import mxnet as mx
 from mxnet import autograd
 from mxnet import gluon
+from mxnet import nd
 import random

+# 为方便比较同一优化算法的从零开始实现和Gluon实现，将输出保持确定。
 mx.random.seed(1)
 random.seed(1)

@@ -64,16 +65,6 @@ true_b = 4.2
 X = nd.random_normal(scale=1, shape=(num_examples, num_inputs))
 y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
 y += .01 * nd.random_normal(scale=1, shape=y.shape)
-dataset = gluon.data.ArrayDataset(X, y)
-
-# 构造迭代器。
-import random
-def data_iter(batch_size):
-    idx = list(range(num_examples))
-    random.shuffle(idx)
-    for batch_i, i in enumerate(range(0, num_examples, batch_size)):
-        j = nd.array(idx[i: min(i + batch_size, num_examples)])
-        yield batch_i, X.take(j), y.take(j)

 # 初始化模型参数。
 def init_params():
@@ -88,46 +79,45 @@ def init_params():
        sqrs.append(param.zeros_like())
        deltas.append(param.zeros_like())
    return params, sqrs, deltas
-
-# 线性回归模型。
-def net(X, w, b):
-    return nd.dot(X, w) + b
-
-# 损失函数。
-def square_loss(yhat, y):
-    return (yhat - y.reshape(yhat.shape)) ** 2 / 2
 ```

 接下来定义训练函数。当epoch大于2时（epoch从1开始计数），学习率以自乘0.1的方式自我衰减。训练函数的period参数说明，每次采样过该数目的数据点后，记录当前目标函数值用于作图。例如，当period和batch_size都为10时，每次迭代后均会记录目标函数值。

 ```{.python .input  n=2}
 %matplotlib inline
+%config InlineBackend.figure_format = 'retina'
 import matplotlib as mpl
-mpl.rcParams['figure.dpi']= 120
 import matplotlib.pyplot as plt
 import numpy as np

-def train(batch_size, rho, epochs, period):
-    assert period >= batch_size and period % batch_size == 0
-    [w, b], sqrs, deltas = init_params()
-    total_loss = [np.mean(square_loss(net(X, w, b), y).asnumpy())]
+import sys
+sys.path.append('..')
+import utils
+
+net = utils.linreg
+squared_loss = utils.squared_loss

-    # 注意epoch从1开始计数。
-    for epoch in range(1, epochs + 1):
-        for batch_i, data, label in data_iter(batch_size):
+def optimize(batch_size, rho, num_epochs, log_interval):
+    [w, b], sqrs, deltas = init_params()
+    y_vals = [nd.mean(squared_loss(net(X, w, b), y)).asnumpy()]
+    print('batch size', batch_size)
+    for epoch in range(1, num_epochs + 1):
+        for batch_i, features, label in utils.data_iter(
+            batch_size, num_examples, random, X, y):
            with autograd.record():
-                output = net(data, w, b)
-                loss = square_loss(output, label)
+                output = net(features, w, b)
+                loss = squared_loss(output, label)
            loss.backward()
            adadelta([w, b], sqrs, deltas, rho, batch_size)
-            if batch_i * batch_size % period == 0:
-                total_loss.append(np.mean(square_loss(net(X, w, b), y).asnumpy()))
-        print("Batch size %d, Epoch %d, loss %.4e" % 
-              (batch_size, epoch, total_loss[-1]))
+            if batch_i * batch_size % log_interval == 0:
+                y_vals.append(
+                    nd.mean(squared_loss(net(X, w, b), y)).asnumpy())
+        print('epoch %d, loss %.4e' % (epoch, y_vals[-1]))
    print('w:', np.reshape(w.asnumpy(), (1, -1)), 
          'b:', b.asnumpy()[0], '\n')
-    x_axis = np.linspace(0, epochs, len(total_loss), endpoint=True)
-    plt.semilogy(x_axis, total_loss)
+    x_vals = np.linspace(0, num_epochs, len(y_vals), endpoint=True)
+    utils.set_fig_size(mpl)
+    plt.semilogy(x_vals, y_vals)
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.show()
@@ -136,7 +126,7 @@ def train(batch_size, rho, epochs, period):
 使用Adadelta，最终学到的参数值与真实值较接近。

 ```{.python .input  n=3}
-train(batch_size=10, rho=0.9999, epochs=3, period=10)
+optimize(batch_size=10, rho=0.9999, num_epochs=3, log_interval=10)
 ```

 ## 结论

--- a/chapter_optimization/adagrad-gluon.md
+++ b/chapter_optimization/adagrad-gluon.md
@@ -7,12 +7,12 @@
 import mxnet as mx
 from mxnet import autograd
 from mxnet import gluon
-from mxnet import ndarray as nd
-import numpy as np
+from mxnet import nd
 import random

-mx.random.seed(1)
+# 为方便比较同一优化算法的从零开始实现和Gluon实现，将输出保持确定。
 random.seed(1)
+mx.random.seed(1)

 # 生成数据集。
 num_inputs = 2
@@ -22,55 +22,31 @@ true_b = 4.2
 X = nd.random_normal(scale=1, shape=(num_examples, num_inputs))
 y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
 y += .01 * nd.random_normal(scale=1, shape=y.shape)
-dataset = gluon.data.ArrayDataset(X, y)

+# 创建模型和定义损失函数。
 net = gluon.nn.Sequential()
 net.add(gluon.nn.Dense(1))
-square_loss = gluon.loss.L2Loss()
 ```

 我们需要在`gluon.Trainer`中指定优化算法名称`adagrad`并设置参数。例如设置初始学习率`learning_rate`。

 ```{.python .input  n=2}
 %matplotlib inline
-import matplotlib as mpl
-mpl.rcParams['figure.dpi']= 120
-import matplotlib.pyplot as plt
-
-def train(batch_size, lr, epochs, period):
-    assert period >= batch_size and period % batch_size == 0
-    net.collect_params().initialize(mx.init.Normal(sigma=1), force_reinit=True)
-    # Adagrad。
-    trainer = gluon.Trainer(net.collect_params(), 'adagrad',
-                            {'learning_rate': lr})
-    data_iter = gluon.data.DataLoader(dataset, batch_size, shuffle=True)
-    total_loss = [np.mean(square_loss(net(X), y).asnumpy())]
-    
-    for epoch in range(1, epochs + 1):
-        for batch_i, (data, label) in enumerate(data_iter):
-            with autograd.record():
-                output = net(data)
-                loss = square_loss(output, label)
-            loss.backward()
-            trainer.step(batch_size)
-            if batch_i * batch_size % period == 0:
-                total_loss.append(np.mean(square_loss(net(X), y).asnumpy()))
-        print("Batch size %d, Learning rate %f, Epoch %d, loss %.4e" % 
-              (batch_size, trainer.learning_rate, epoch, total_loss[-1]))
-
-    print('w:', np.reshape(net[0].weight.data().asnumpy(), (1, -1)), 
-          'b:', net[0].bias.data().asnumpy()[0], '\n')
-    x_axis = np.linspace(0, epochs, len(total_loss), endpoint=True)
-    plt.semilogy(x_axis, total_loss)
-    plt.xlabel('epoch')
-    plt.ylabel('loss')
-    plt.show()
+%config InlineBackend.figure_format = 'retina'
+import numpy as np
+import sys
+sys.path.append('..')
+import utils
 ```

 使用Adagrad，最终学到的参数值与真实值较接近。

 ```{.python .input  n=3}
-train(batch_size=10, lr=0.9, epochs=3, period=10)
+net.collect_params().initialize(mx.init.Normal(sigma=1), force_reinit=True)
+trainer = gluon.Trainer(net.collect_params(), 'adagrad',
+                        {'learning_rate': 0.9})
+utils.optimize(batch_size=10, trainer=trainer, num_epochs=3, decay_epoch=None,
+               log_interval=10, X=X, y=y, net=net)
 ```

 ## 结论

--- a/chapter_optimization/adagrad-scratch.md
+++ b/chapter_optimization/adagrad-scratch.md
@@ -64,12 +64,13 @@ def adagrad(params, sqrs, lr, batch_size):
 实验中，我们以线性回归为例。其中真实参数`w`为[2, -3.4]，`b`为4.2。我们把梯度按元素平方的累加变量初始化为和参数形状相同的零张量。

 ```{.python .input  n=2}
-from mxnet import ndarray as nd
 import mxnet as mx
 from mxnet import autograd
 from mxnet import gluon
+from mxnet import nd
 import random

+# 为方便比较同一优化算法的从零开始实现和Gluon实现，将输出保持确定。
 mx.random.seed(1)
 random.seed(1)

@@ -81,16 +82,6 @@ true_b = 4.2
 X = nd.random_normal(scale=1, shape=(num_examples, num_inputs))
 y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
 y += .01 * nd.random_normal(scale=1, shape=y.shape)
-dataset = gluon.data.ArrayDataset(X, y)
-
-# 构造迭代器。
-import random
-def data_iter(batch_size):
-    idx = list(range(num_examples))
-    random.shuffle(idx)
-    for batch_i, i in enumerate(range(0, num_examples, batch_size)):
-        j = nd.array(idx[i: min(i + batch_size, num_examples)])
-        yield batch_i, X.take(j), y.take(j)

 # 初始化模型参数。
 def init_params():
@@ -103,14 +94,6 @@ def init_params():
        # 把梯度按元素平方的累加变量初始化为和参数形状相同的零张量。
        sqrs.append(param.zeros_like())
    return params, sqrs
-
-# 线性回归模型。
-def net(X, w, b):
-    return nd.dot(X, w) + b
-
-# 损失函数。
-def square_loss(yhat, y):
-    return (yhat - y.reshape(yhat.shape)) ** 2 / 2
 ```

 接下来定义训练函数。训练函数的period参数说明，每次采样过该数目的数据点后，记录当前目标函数值用于作图。例如，当period和batch_size都为10时，每次迭代后均会记录目标函数值。
@@ -119,32 +102,40 @@ def square_loss(yhat, y):

 ```{.python .input  n=3}
 %matplotlib inline
+%config InlineBackend.figure_format = 'retina'
 import matplotlib as mpl
-mpl.rcParams['figure.dpi']= 120
 import matplotlib.pyplot as plt
 import numpy as np

-def train(batch_size, lr, epochs, period):
-    assert period >= batch_size and period % batch_size == 0
-    [w, b], sqrs = init_params()
-    total_loss = [np.mean(square_loss(net(X, w, b), y).asnumpy())]
+import sys
+sys.path.append('..')
+import utils
+
+net = utils.linreg
+squared_loss = utils.squared_loss

-    # 注意epoch从1开始计数。
-    for epoch in range(1, epochs + 1):
-        for batch_i, data, label in data_iter(batch_size):
+def optimize(batch_size, lr, num_epochs, log_interval):
+    [w, b], sqrs = init_params()
+    y_vals = [nd.mean(squared_loss(net(X, w, b), y)).asnumpy()]
+    print('batch size', batch_size)
+    for epoch in range(1, num_epochs + 1):
+        for batch_i, features, label in utils.data_iter(
+            batch_size, num_examples, random, X, y):
            with autograd.record():
-                output = net(data, w, b)
-                loss = square_loss(output, label)
+                output = net(features, w, b)
+                loss = squared_loss(output, label)
            loss.backward()
            adagrad([w, b], sqrs, lr, batch_size)
-            if batch_i * batch_size % period == 0:
-                total_loss.append(np.mean(square_loss(net(X, w, b), y).asnumpy()))
-        print("Batch size %d, Learning rate %f, Epoch %d, loss %.4e" % 
-              (batch_size, lr, epoch, total_loss[-1]))
+            if batch_i * batch_size % log_interval == 0:
+                y_vals.append(
+                    nd.mean(squared_loss(net(X, w, b), y)).asnumpy())
+        print('epoch %d, learning rate %f, loss %.4e' % 
+              (epoch, lr, y_vals[-1]))
    print('w:', np.reshape(w.asnumpy(), (1, -1)), 
          'b:', b.asnumpy()[0], '\n')
-    x_axis = np.linspace(0, epochs, len(total_loss), endpoint=True)
-    plt.semilogy(x_axis, total_loss)
+    x_vals = np.linspace(0, num_epochs, len(y_vals), endpoint=True)
+    utils.set_fig_size(mpl)
+    plt.semilogy(x_vals, y_vals)
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.show()
@@ -153,7 +144,7 @@ def train(batch_size, lr, epochs, period):
 使用Adagrad，最终学到的参数值与真实值较接近。

 ```{.python .input  n=4}
-train(batch_size=10, lr=0.9, epochs=3, period=10)
+optimize(batch_size=10, lr=0.9, num_epochs=3, log_interval=10)
 ```

 ## 结论

--- a/chapter_optimization/adam-gluon.md
+++ b/chapter_optimization/adam-gluon.md
@@ -7,12 +7,12 @@
 import mxnet as mx
 from mxnet import autograd
 from mxnet import gluon
-from mxnet import ndarray as nd
-import numpy as np
+from mxnet import nd
 import random

-mx.random.seed(1)
+# 为方便比较同一优化算法的从零开始实现和Gluon实现，将输出保持确定。
 random.seed(1)
+mx.random.seed(1)

 # 生成数据集。
 num_inputs = 2
@@ -22,55 +22,31 @@ true_b = 4.2
 X = nd.random_normal(scale=1, shape=(num_examples, num_inputs))
 y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
 y += .01 * nd.random_normal(scale=1, shape=y.shape)
-dataset = gluon.data.ArrayDataset(X, y)

+# 创建模型和定义损失函数。
 net = gluon.nn.Sequential()
 net.add(gluon.nn.Dense(1))
-square_loss = gluon.loss.L2Loss()
 ```

 我们需要在`gluon.Trainer`中指定优化算法名称`adam`并设置学习率。

 ```{.python .input  n=2}
 %matplotlib inline
-import matplotlib as mpl
-mpl.rcParams['figure.dpi']= 120
-import matplotlib.pyplot as plt
-
-def train(batch_size, lr, epochs, period):
-    assert period >= batch_size and period % batch_size == 0
-    net.collect_params().initialize(mx.init.Normal(sigma=1), force_reinit=True)
-    # Adam。
-    trainer = gluon.Trainer(net.collect_params(), 'adam',
-                            {'learning_rate': lr})
-    data_iter = gluon.data.DataLoader(dataset, batch_size, shuffle=True)
-    total_loss = [np.mean(square_loss(net(X), y).asnumpy())]
-
-    for epoch in range(1, epochs + 1):
-        for batch_i, (data, label) in enumerate(data_iter):
-            with autograd.record():
-                output = net(data)
-                loss = square_loss(output, label)
-            loss.backward()
-            trainer.step(batch_size)
-            if batch_i * batch_size % period == 0:
-                total_loss.append(np.mean(square_loss(net(X), y).asnumpy()))
-        print("Batch size %d, Learning rate %f, Epoch %d, loss %.4e" %
-              (batch_size, trainer.learning_rate, epoch, total_loss[-1]))
-
-    print('w:', np.reshape(net[0].weight.data().asnumpy(), (1, -1)),
-          'b:', net[0].bias.data().asnumpy()[0], '\n')
-    x_axis = np.linspace(0, epochs, len(total_loss), endpoint=True)
-    plt.semilogy(x_axis, total_loss)
-    plt.xlabel('epoch')
-    plt.ylabel('loss')
-    plt.show()
+%config InlineBackend.figure_format = 'retina'
+import numpy as np
+import sys
+sys.path.append('..')
+import utils
 ```

 使用Adam，最终学到的参数值与真实值较接近。

 ```{.python .input  n=3}
-train(batch_size=10, lr=0.1, epochs=3, period=10)
+net.collect_params().initialize(mx.init.Normal(sigma=1), force_reinit=True)
+trainer = gluon.Trainer(net.collect_params(), 'adam',
+                        {'learning_rate': 0.1})
+utils.optimize(batch_size=10, trainer=trainer, num_epochs=3, decay_epoch=None,
+               log_interval=10, X=X, y=y, net=net)
 ```

 ## 结论

--- a/chapter_optimization/adam-scratch.md
+++ b/chapter_optimization/adam-scratch.md
@@ -69,10 +69,11 @@ def adam(params, vs, sqrs, lr, batch_size, t):
 ```{.python .input  n=1}
 import mxnet as mx
 from mxnet import autograd
-from mxnet import ndarray as nd
 from mxnet import gluon
+from mxnet import nd
 import random

+# 为方便比较同一优化算法的从零开始实现和Gluon实现，将输出保持确定。
 mx.random.seed(1)
 random.seed(1)

@@ -84,16 +85,18 @@ true_b = 4.2
 X = nd.random_normal(scale=1, shape=(num_examples, num_inputs))
 y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
 y += .01 * nd.random_normal(scale=1, shape=y.shape)
-dataset = gluon.data.ArrayDataset(X, y)

-# 构造迭代器。
-import random
-def data_iter(batch_size):
-    idx = list(range(num_examples))
-    random.shuffle(idx)
-    for batch_i, i in enumerate(range(0, num_examples, batch_size)):
-        j = nd.array(idx[i: min(i + batch_size, num_examples)])
-        yield batch_i, X.take(j), y.take(j)
+# 初始化模型参数。
+def init_params():
+    w = nd.random_normal(scale=1, shape=(num_inputs, 1))
+    b = nd.zeros(shape=(1,))
+    params = [w, b]
+    sqrs = []
+    for param in params:
+        param.attach_grad()
+        # 把梯度按元素平方的指数加权移动平均变量初始化为和参数形状相同的零张量。
+        sqrs.append(param.zeros_like())
+    return params, sqrs

 # 初始化模型参数。
 def init_params():
@@ -108,49 +111,50 @@ def init_params():
        vs.append(param.zeros_like())
        sqrs.append(param.zeros_like())
    return params, vs, sqrs
-
-# 线性回归模型。
-def net(X, w, b):
-    return nd.dot(X, w) + b
-
-# 损失函数。
-def square_loss(yhat, y):
-    return (yhat - y.reshape(yhat.shape)) ** 2 / 2
 ```

 接下来定义训练函数。当epoch大于2时（epoch从1开始计数），学习率以自乘0.1的方式自我衰减。训练函数的period参数说明，每次采样过该数目的数据点后，记录当前目标函数值用于作图。例如，当period和batch_size都为10时，每次迭代后均会记录目标函数值。

 ```{.python .input  n=2}
 %matplotlib inline
+%config InlineBackend.figure_format = 'retina'
 import matplotlib as mpl
-mpl.rcParams['figure.dpi']= 120
 import matplotlib.pyplot as plt
 import numpy as np

-def train(batch_size, lr, epochs, period):
-    assert period >= batch_size and period % batch_size == 0
-    [w, b], vs, sqrs = init_params()
-    total_loss = [np.mean(square_loss(net(X, w, b), y).asnumpy())]
+import sys
+sys.path.append('..')
+import utils
+
+net = utils.linreg
+squared_loss = utils.squared_loss

-    # 注意epoch从1开始计数。
+def optimize(batch_size, lr, num_epochs, log_interval):
+    [w, b], vs, sqrs = init_params()
+    y_vals = [nd.mean(squared_loss(net(X, w, b), y)).asnumpy()]
+    print('batch size', batch_size)
+    
    t = 0
-    for epoch in range(1, epochs + 1):
-        for batch_i, data, label in data_iter(batch_size):
+    for epoch in range(1, num_epochs + 1):
+        for batch_i, features, label in utils.data_iter(
+            batch_size, num_examples, random, X, y):
            with autograd.record():
-                output = net(data, w, b)
-                loss = square_loss(output, label)
+                output = net(features, w, b)
+                loss = squared_loss(output, label)
            loss.backward()
            # 必须在调用Adam前。
            t += 1
            adam([w, b], vs, sqrs, lr, batch_size, t)
-            if batch_i * batch_size % period == 0:
-                total_loss.append(np.mean(square_loss(net(X, w, b), y).asnumpy()))
-        print("Batch size %d, Learning rate %f, Epoch %d, loss %.4e" % 
-              (batch_size, lr, epoch, total_loss[-1]))
+            if batch_i * batch_size % log_interval == 0:
+                y_vals.append(
+                    nd.mean(squared_loss(net(X, w, b), y)).asnumpy())
+        print('epoch %d, learning rate %f, loss %.4e' % 
+              (epoch, lr, y_vals[-1]))
    print('w:', np.reshape(w.asnumpy(), (1, -1)), 
          'b:', b.asnumpy()[0], '\n')
-    x_axis = np.linspace(0, epochs, len(total_loss), endpoint=True)
-    plt.semilogy(x_axis, total_loss)
+    x_vals = np.linspace(0, num_epochs, len(y_vals), endpoint=True)
+    utils.set_fig_size(mpl)
+    plt.semilogy(x_vals, y_vals)
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.show()
@@ -159,7 +163,7 @@ def train(batch_size, lr, epochs, period):
 使用Adam，最终学到的参数值与真实值较接近。

 ```{.python .input  n=3}
-train(batch_size=10, lr=0.1, epochs=3, period=10)
+optimize(batch_size=10, lr=0.1, num_epochs=3, log_interval=10)
 ```

 ## 结论

--- a/chapter_optimization/gd-sgd-gluon.md
+++ b/chapter_optimization/gd-sgd-gluon.md
@@ -22,7 +22,6 @@ X = nd.random_normal(scale=1, shape=(num_examples, num_inputs))
 y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
 y += .01 * nd.random_normal(scale=1, shape=y.shape)

-
 # 创建模型和定义损失函数。
 net = gluon.nn.Sequential()
 net.add(gluon.nn.Dense(1))

--- a/chapter_optimization/gd-sgd-scratch.md
+++ b/chapter_optimization/gd-sgd-scratch.md
--- a/chapter_optimization/momentum-gluon.md
+++ b/chapter_optimization/momentum-gluon.md
@@ -7,12 +7,12 @@
 import mxnet as mx
 from mxnet import autograd
 from mxnet import gluon
-from mxnet import ndarray as nd
-import numpy as np
+from mxnet import nd
 import random

-mx.random.seed(1)
+# 为方便比较同一优化算法的从零开始实现和Gluon实现，将输出保持确定。
 random.seed(1)
+mx.random.seed(1)

 # 生成数据集。
 num_inputs = 2
@@ -22,59 +22,32 @@ true_b = 4.2
 X = nd.random_normal(scale=1, shape=(num_examples, num_inputs))
 y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
 y += .01 * nd.random_normal(scale=1, shape=y.shape)
-dataset = gluon.data.ArrayDataset(X, y)

+
+# 创建模型和定义损失函数。
 net = gluon.nn.Sequential()
 net.add(gluon.nn.Dense(1))
-square_loss = gluon.loss.L2Loss()
 ```

 为了使学习率在两个epoch后自我衰减，我们需要访问`gluon.Trainer`的`learning_rate`属性和`set_learning_rate`函数。

-```{.python .input  n=2}
+```{.python .input}
 %matplotlib inline
-import matplotlib as mpl
-mpl.rcParams['figure.dpi']= 120
-import matplotlib.pyplot as plt
-
-def train(batch_size, lr, mom, epochs, period):
-    assert period >= batch_size and period % batch_size == 0
-    net.collect_params().initialize(mx.init.Normal(sigma=1), force_reinit=True)
-    # 动量法。
-    trainer = gluon.Trainer(net.collect_params(), 'sgd',
-                            {'learning_rate': lr, 'momentum': mom})
-    data_iter = gluon.data.DataLoader(dataset, batch_size, shuffle=True)
-    total_loss = [np.mean(square_loss(net(X), y).asnumpy())]
-    
-    for epoch in range(1, epochs + 1):
-        # 重设学习率。
-        if epoch > 2:
-            trainer.set_learning_rate(trainer.learning_rate * 0.1)
-        for batch_i, (data, label) in enumerate(data_iter):
-            with autograd.record():
-                output = net(data)
-                loss = square_loss(output, label)
-            loss.backward()
-            trainer.step(batch_size)
-
-            if batch_i * batch_size % period == 0:
-                total_loss.append(np.mean(square_loss(net(X), y).asnumpy()))
-        print("Batch size %d, Learning rate %f, Epoch %d, loss %.4e" % 
-              (batch_size, trainer.learning_rate, epoch, total_loss[-1]))
-
-    print('w:', np.reshape(net[0].weight.data().asnumpy(), (1, -1)), 
-          'b:', net[0].bias.data().asnumpy()[0], '\n')
-    x_axis = np.linspace(0, epochs, len(total_loss), endpoint=True)
-    plt.semilogy(x_axis, total_loss)
-    plt.xlabel('epoch')
-    plt.ylabel('loss')
-    plt.show()
+%config InlineBackend.figure_format = 'retina'
+import numpy as np
+import sys
+sys.path.append('..')
+import utils
 ```

 使用动量法，最终学到的参数值与真实值较接近。

 ```{.python .input  n=3}
-train(batch_size=10, lr=0.2, mom=0.9, epochs=3, period=10)
+net.collect_params().initialize(mx.init.Normal(sigma=1), force_reinit=True)
+trainer = gluon.Trainer(net.collect_params(), 'sgd',
+                        {'learning_rate': 0.2, 'momentum': 0.9})
+utils.optimize(batch_size=10, trainer=trainer, num_epochs=3, decay_epoch=2,
+               log_interval=10, X=X, y=y, net=net)
 ```

 ## 结论

--- a/chapter_optimization/momentum-scratch.md
+++ b/chapter_optimization/momentum-scratch.md
@@ -65,6 +65,7 @@ from mxnet import ndarray as nd
 from mxnet import gluon
 import random

+# 为方便比较同一优化算法的从零开始实现和Gluon实现，将输出保持确定。
 mx.random.seed(1)
 random.seed(1)

@@ -76,16 +77,6 @@ true_b = 4.2
 X = nd.random_normal(scale=1, shape=(num_examples, num_inputs))
 y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
 y += .01 * nd.random_normal(scale=1, shape=y.shape)
-dataset = gluon.data.ArrayDataset(X, y)
-
-# 构造迭代器。
-import random
-def data_iter(batch_size):
-    idx = list(range(num_examples))
-    random.shuffle(idx)
-    for batch_i, i in enumerate(range(0, num_examples, batch_size)):
-        j = nd.array(idx[i: min(i + batch_size, num_examples)])
-        yield batch_i, X.take(j), y.take(j)

 # 初始化模型参数。
 def init_params():
@@ -98,49 +89,49 @@ def init_params():
        # 把速度项初始化为和参数形状相同的零张量。
        vs.append(param.zeros_like())
    return params, vs
-
-# 线性回归模型。
-def net(X, w, b):
-    return nd.dot(X, w) + b
-
-# 损失函数。
-def square_loss(yhat, y):
-    return (yhat - y.reshape(yhat.shape)) ** 2 / 2
 ```

 接下来定义训练函数。当epoch大于2时（epoch从1开始计数），学习率以自乘0.1的方式自我衰减。训练函数的period参数说明，每次采样过该数目的数据点后，记录当前目标函数值用于作图。例如，当period和batch_size都为10时，每次迭代后均会记录目标函数值。

 ```{.python .input  n=3}
 %matplotlib inline
+%config InlineBackend.figure_format = 'retina'
 import matplotlib as mpl
-mpl.rcParams['figure.dpi']= 120
 import matplotlib.pyplot as plt
 import numpy as np

-def train(batch_size, lr, mom, epochs, period):
-    assert period >= batch_size and period % batch_size == 0
-    [w, b], vs = init_params()
-    total_loss = [np.mean(square_loss(net(X, w, b), y).asnumpy())]
+import sys
+sys.path.append('..')
+import utils
+
+net = utils.linreg
+squared_loss = utils.squared_loss

-    # 注意epoch从1开始计数。
-    for epoch in range(1, epochs + 1):
-        # 重设学习率。
+def optimize(batch_size, lr, mom, num_epochs, log_interval):
+    [w, b], vs = init_params()
+    y_vals = [nd.mean(squared_loss(net(X, w, b), y)).asnumpy()]
+    print('batch size', batch_size)
+    for epoch in range(1, num_epochs + 1):
+        # 学习率自我衰减。
        if epoch > 2:
            lr *= 0.1
-        for batch_i, data, label in data_iter(batch_size):
+        for batch_i, features, label in utils.data_iter(
+            batch_size, num_examples, random, X, y):
            with autograd.record():
-                output = net(data, w, b)
-                loss = square_loss(output, label)
+                output = net(features, w, b)
+                loss = squared_loss(output, label)
            loss.backward()
            sgd_momentum([w, b], vs, lr, mom, batch_size)
-            if batch_i * batch_size % period == 0:
-                total_loss.append(np.mean(square_loss(net(X, w, b), y).asnumpy()))
-        print("Batch size %d, Learning rate %f, Epoch %d, loss %.4e" %
-              (batch_size, lr, epoch, total_loss[-1]))
-    print('w:', np.reshape(w.asnumpy(), (1, -1)),
+            if batch_i * batch_size % log_interval == 0:
+                y_vals.append(
+                    nd.mean(squared_loss(net(X, w, b), y)).asnumpy())
+        print('epoch %d, learning rate %f, loss %.4e' % 
+              (epoch, lr, y_vals[-1]))
+    print('w:', np.reshape(w.asnumpy(), (1, -1)), 
          'b:', b.asnumpy()[0], '\n')
-    x_axis = np.linspace(0, epochs, len(total_loss), endpoint=True)
-    plt.semilogy(x_axis, total_loss)
+    x_vals = np.linspace(0, num_epochs, len(y_vals), endpoint=True)
+    utils.set_fig_size(mpl)
+    plt.semilogy(x_vals, y_vals)
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.show()
@@ -149,7 +140,7 @@ def train(batch_size, lr, mom, epochs, period):
 使用动量法，最终学到的参数值与真实值较接近。

 ```{.python .input  n=4}
-train(batch_size=10, lr=0.2, mom=0.9, epochs=3, period=10)
+optimize(batch_size=10, lr=0.2, mom=0.9, num_epochs=3, log_interval=10)
 ```

 ## 结论

--- a/chapter_optimization/rmsprop-gluon.md
+++ b/chapter_optimization/rmsprop-gluon.md
@@ -7,12 +7,12 @@
 import mxnet as mx
 from mxnet import autograd
 from mxnet import gluon
-from mxnet import ndarray as nd
-import numpy as np
+from mxnet import nd
 import random

-mx.random.seed(1)
+# 为方便比较同一优化算法的从零开始实现和Gluon实现，将输出保持确定。
 random.seed(1)
+mx.random.seed(1)

 # 生成数据集。
 num_inputs = 2
@@ -22,61 +22,41 @@ true_b = 4.2
 X = nd.random_normal(scale=1, shape=(num_examples, num_inputs))
 y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
 y += .01 * nd.random_normal(scale=1, shape=y.shape)
-dataset = gluon.data.ArrayDataset(X, y)

+# 创建模型和定义损失函数。
 net = gluon.nn.Sequential()
 net.add(gluon.nn.Dense(1))
-square_loss = gluon.loss.L2Loss()
 ```

 我们需要在`gluon.Trainer`中指定优化算法名称`rmsprop`并设置参数。例如设置初始学习率`learning_rate`和指数加权移动平均中gamma1参数。

 ```{.python .input  n=2}
 %matplotlib inline
-import matplotlib as mpl
-mpl.rcParams['figure.dpi']= 120
-import matplotlib.pyplot as plt
-
-def train(batch_size, lr, gamma, epochs, period):
-    assert period >= batch_size and period % batch_size == 0
-    net.collect_params().initialize(mx.init.Normal(sigma=1), force_reinit=True)
-    # RMSProp。
-    trainer = gluon.Trainer(net.collect_params(), 'rmsprop',
-                            {'learning_rate': lr, 'gamma1': gamma})
-    data_iter = gluon.data.DataLoader(dataset, batch_size, shuffle=True)
-    total_loss = [np.mean(square_loss(net(X), y).asnumpy())]
-    
-    for epoch in range(1, epochs + 1):
-        for batch_i, (data, label) in enumerate(data_iter):
-            with autograd.record():
-                output = net(data)
-                loss = square_loss(output, label)
-            loss.backward()
-            trainer.step(batch_size)
-            if batch_i * batch_size % period == 0:
-                total_loss.append(np.mean(square_loss(net(X), y).asnumpy()))
-        print("Batch size %d, Learning rate %f, Epoch %d, loss %.4e" % 
-              (batch_size, trainer.learning_rate, epoch, total_loss[-1]))
-
-    print('w:', np.reshape(net[0].weight.data().asnumpy(), (1, -1)), 
-          'b:', net[0].bias.data().asnumpy()[0], '\n')
-    x_axis = np.linspace(0, epochs, len(total_loss), endpoint=True)
-    plt.semilogy(x_axis, total_loss)
-    plt.xlabel('epoch')
-    plt.ylabel('loss')
-    plt.show()
+%config InlineBackend.figure_format = 'retina'
+import numpy as np
+import sys
+sys.path.append('..')
+import utils
 ```

 我们将初始学习率设为0.03，并将gamma设为0.9。损失函数在迭代后期较震荡。

 ```{.python .input  n=3}
-train(batch_size=10, lr=0.03, gamma=0.9, epochs=3, period=10)
+net.collect_params().initialize(mx.init.Normal(sigma=1), force_reinit=True)
+trainer = gluon.Trainer(net.collect_params(), 'rmsprop',
+                        {'learning_rate': 0.03, 'gamma1': 0.9})
+utils.optimize(batch_size=10, trainer=trainer, num_epochs=3, decay_epoch=None,
+               log_interval=10, X=X, y=y, net=net)
 ```

 我们将gamma调大一点，例如0.999。这时损失函数在迭代后期较平滑。

 ```{.python .input}
-train(batch_size=10, lr=0.03, gamma=0.999, epochs=3, period=10)
+net.collect_params().initialize(mx.init.Normal(sigma=1), force_reinit=True)
+trainer = gluon.Trainer(net.collect_params(), 'rmsprop',
+                        {'learning_rate': 0.03, 'gamma1': 0.999})
+utils.optimize(batch_size=10, trainer=trainer, num_epochs=3, decay_epoch=None,
+               log_interval=10, X=X, y=y, net=net)
 ```

 ## 结论

--- a/chapter_optimization/rmsprop-scratch.md
+++ b/chapter_optimization/rmsprop-scratch.md
@@ -47,12 +47,13 @@ def rmsprop(params, sqrs, lr, gamma, batch_size):
 实验中，我们以线性回归为例。其中真实参数`w`为[2, -3.4]，`b`为4.2。我们把梯度按元素平方的指数加权移动平均变量初始化为和参数形状相同的零张量。

 ```{.python .input  n=1}
-from mxnet import ndarray as nd
 import mxnet as mx
 from mxnet import autograd
 from mxnet import gluon
+from mxnet import nd
 import random

+# 为方便比较同一优化算法的从零开始实现和Gluon实现，将输出保持确定。
 mx.random.seed(1)
 random.seed(1)

@@ -64,16 +65,6 @@ true_b = 4.2
 X = nd.random_normal(scale=1, shape=(num_examples, num_inputs))
 y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
 y += .01 * nd.random_normal(scale=1, shape=y.shape)
-dataset = gluon.data.ArrayDataset(X, y)
-
-# 构造迭代器。
-import random
-def data_iter(batch_size):
-    idx = list(range(num_examples))
-    random.shuffle(idx)
-    for batch_i, i in enumerate(range(0, num_examples, batch_size)):
-        j = nd.array(idx[i: min(i + batch_size, num_examples)])
-        yield batch_i, X.take(j), y.take(j)

 # 初始化模型参数。
 def init_params():
@@ -86,46 +77,46 @@ def init_params():
        # 把梯度按元素平方的指数加权移动平均变量初始化为和参数形状相同的零张量。
        sqrs.append(param.zeros_like())
    return params, sqrs
-
-# 线性回归模型。
-def net(X, w, b):
-    return nd.dot(X, w) + b
-
-# 损失函数。
-def square_loss(yhat, y):
-    return (yhat - y.reshape(yhat.shape)) ** 2 / 2
 ```

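-接下来定义训练函数。训练函数的period参数说明，每次采样过该数目的数据点后，记录当前目标函数值用于作图。例如，当period和batch_size都为10时，每次迭代后均会记录目标函数值。
+接下来定义优化函数。优化函数的`log_interval`参数说明，每次采样过该数目的数据点后，记录当前目标函数值用于作图。例如，当`log_interval`和`batch_size`都为10时，每次迭代后均会记录目标函数值。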

 ```{.python .input  n=2}
 %matplotlib inline
+%config InlineBackend.figure_format = 'retina'
 import matplotlib as mpl
-mpl.rcParams['figure.dpi']= 120
 import matplotlib.pyplot as plt
 import numpy as np

-def train(batch_size, lr, gamma, epochs, period):
-    assert period >= batch_size and period % batch_size == 0
-    [w, b], sqrs = init_params()
-    total_loss = [np.mean(square_loss(net(X, w, b), y).asnumpy())]
+import sys
+sys.path.append('..')
+import utils
+
+net = utils.linreg
+squared_loss = utils.squared_loss

-    # 注意epoch从1开始计数。
-    for epoch in range(1, epochs + 1):
-        for batch_i, data, label in data_iter(batch_size):
+def optimize(batch_size, lr, gamma, num_epochs, log_interval):
+    [w, b], sqrs = init_params()
+    y_vals = [nd.mean(squared_loss(net(X, w, b), y)).asnumpy()]
+    print('batch size', batch_size)
+    for epoch in range(1, num_epochs + 1):
+        for batch_i, features, label in utils.data_iter(
+            batch_size, num_examples, random, X, y):
            with autograd.record():
-                output = net(data, w, b)
-                loss = square_loss(output, label)
+                output = net(features, w, b)
+                loss = squared_loss(output, label)
            loss.backward()
            rmsprop([w, b], sqrs, lr, gamma, batch_size)
-            if batch_i * batch_size % period == 0:
-                total_loss.append(np.mean(square_loss(net(X, w, b), y).asnumpy()))
-        print("Batch size %d, Learning rate %f, Epoch %d, loss %.4e" % 
-              (batch_size, lr, epoch, total_loss[-1]))
+            if batch_i * batch_size % log_interval == 0:
+                y_vals.append(
+                    nd.mean(squared_loss(net(X, w, b), y)).asnumpy())
+        print('epoch %d, learning rate %f, loss %.4e' % 
+              (epoch, lr, y_vals[-1]))
    print('w:', np.reshape(w.asnumpy(), (1, -1)), 
          'b:', b.asnumpy()[0], '\n')
-    x_axis = np.linspace(0, epochs, len(total_loss), endpoint=True)
-    plt.semilogy(x_axis, total_loss)
+    x_vals = np.linspace(0, num_epochs, len(y_vals), endpoint=True)
+    utils.set_fig_size(mpl)
+    plt.semilogy(x_vals, y_vals)
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.show()
@@ -134,13 +125,13 @@ def train(batch_size, lr, gamma, epochs, period):
 我们将初始学习率设为0.03，并将gamma设为0.9。损失函数在迭代后期较震荡。

 ```{.python .input  n=3}
-train(batch_size=10, lr=0.03, gamma=0.9, epochs=3, period=10)
+optimize(batch_size=10, lr=0.03, gamma=0.9, num_epochs=3, log_interval=10)
 ```

 我们将gamma调大一点，例如0.999。这时损失函数在迭代后期较平滑。

 ```{.python .input}
-train(batch_size=10, lr=0.03, gamma=0.999, epochs=3, period=10)
+optimize(batch_size=10, lr=0.03, gamma=0.999, num_epochs=3, log_interval=10)
 ```

 ## 结论

--- a/utils.py
+++ b/utils.py
@@ -369,7 +369,7 @@ def squared_loss(yhat, y):


 def optimize(batch_size, trainer, num_epochs, decay_epoch, log_interval, X, y,
-             net):
+             net, print_lr=True):
    """优化目标函数。"""
    dataset = gluon.data.ArrayDataset(X, y)
    data_iter = gluon.data.DataLoader(dataset, batch_size, shuffle=True)
@@ -388,8 +388,11 @@ def optimize(batch_size, trainer, num_epochs, decay_epoch, log_interval, X, y,
            trainer.step(batch_size)
            if batch_i * batch_size % log_interval == 0:
                y_vals.append(nd.mean(square_loss(net(X), y)).asnumpy())
-        print("epoch %d, learning rate %f, loss %.4e" %
-              (epoch, trainer.learning_rate, y_vals[-1]))
+        if print_lr:
+            print("epoch %d, learning rate %f, loss %.4e" %
+                  (epoch, trainer.learning_rate, y_vals[-1]))
+        else:
+            print("epoch %d, loss %.4e" % (epoch, y_vals[-1]))
    print('w:', np.reshape(net[0].weight.data().asnumpy(), (1, -1)),
          'b:', net[0].bias.data().asnumpy()[0], '\n')
    x_vals = np.linspace(0, num_epochs, len(y_vals), endpoint=True)