Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
OpenDocCN
d2l-zh
提交
ae48f45b
D
d2l-zh
项目概览
OpenDocCN
/
d2l-zh
通知
2
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
d2l-zh
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
提交
ae48f45b
编写于
3月 30, 2018
作者:
A
Aston Zhang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
refactor opt code
上级
bb73c921
变更
13
展开全部
隐藏空白更改
内联
并排
Showing
13 changed file
with
350 addition
and
382 deletion
+350
-382
chapter_optimization/adadelta-gluon.md
chapter_optimization/adadelta-gluon.md
+13
-39
chapter_optimization/adadelta-scratch.md
chapter_optimization/adadelta-scratch.md
+26
-36
chapter_optimization/adagrad-gluon.md
chapter_optimization/adagrad-gluon.md
+14
-38
chapter_optimization/adagrad-scratch.md
chapter_optimization/adagrad-scratch.md
+27
-36
chapter_optimization/adam-gluon.md
chapter_optimization/adam-gluon.md
+14
-38
chapter_optimization/adam-scratch.md
chapter_optimization/adam-scratch.md
+39
-35
chapter_optimization/gd-sgd-gluon.md
chapter_optimization/gd-sgd-gluon.md
+0
-1
chapter_optimization/gd-sgd-scratch.md
chapter_optimization/gd-sgd-scratch.md
+120
-0
chapter_optimization/momentum-gluon.md
chapter_optimization/momentum-gluon.md
+16
-43
chapter_optimization/momentum-scratch.md
chapter_optimization/momentum-scratch.md
+28
-37
chapter_optimization/rmsprop-gluon.md
chapter_optimization/rmsprop-gluon.md
+19
-39
chapter_optimization/rmsprop-scratch.md
chapter_optimization/rmsprop-scratch.md
+28
-37
utils.py
utils.py
+6
-3
未找到文件。
chapter_optimization/adadelta-gluon.md
浏览文件 @
ae48f45b
...
...
@@ -6,12 +6,12 @@
import mxnet as mx
from mxnet import autograd
from mxnet import gluon
from mxnet import ndarray as nd
import numpy as np
from mxnet import nd
import random
mx.random.seed(1)
# 为方便比较同一优化算法的从零开始实现和Gluon实现,将输出保持确定。
random.seed(1)
mx.random.seed(1)
# 生成数据集。
num_inputs = 2
...
...
@@ -21,56 +21,30 @@ true_b = 4.2
X = nd.random_normal(scale=1, shape=(num_examples, num_inputs))
y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
y += .01 * nd.random_normal(scale=1, shape=y.shape)
dataset = gluon.data.ArrayDataset(X, y)
# 创建模型和定义损失函数。
net = gluon.nn.Sequential()
net.add(gluon.nn.Dense(1))
square_loss = gluon.loss.L2Loss()
```
我们需要在
`gluon.Trainer`
中指定优化算法名称
`adadelta`
并设置rho参数。
```
{.python .input n=2}
%matplotlib inline
import matplotlib as mpl
mpl.rcParams['figure.dpi']= 120
import matplotlib.pyplot as plt
def train(batch_size, rho, epochs, period):
assert period >= batch_size and period % batch_size == 0
net.collect_params().initialize(mx.init.Normal(sigma=1), force_reinit=True)
# Adadelta。
trainer = gluon.Trainer(net.collect_params(), 'adadelta',
{'rho': rho})
data_iter = gluon.data.DataLoader(dataset, batch_size, shuffle=True)
total_loss = [np.mean(square_loss(net(X), y).asnumpy())]
for epoch in range(1, epochs + 1):
for batch_i, (data, label) in enumerate(data_iter):
with autograd.record():
output = net(data)
loss = square_loss(output, label)
loss.backward()
trainer.step(batch_size)
if batch_i * batch_size % period == 0:
total_loss.append(np.mean(square_loss(net(X), y).asnumpy()))
print("Batch size %d, Epoch %d, loss %.4e" %
(batch_size, epoch, total_loss[-1]))
print('w:', np.reshape(net[0].weight.data().asnumpy(), (1, -1)),
'b:', net[0].bias.data().asnumpy()[0], '\n')
x_axis = np.linspace(0, epochs, len(total_loss), endpoint=True)
plt.semilogy(x_axis, total_loss)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()
%config InlineBackend.figure_format = 'retina'
import numpy as np
import sys
sys.path.append('..')
import utils
```
使用Adadelta,最终学到的参数值与真实值较接近。
```
{.python .input n=3}
train(batch_size=10, rho=0.9999, epochs=3, period=10)
net.collect_params().initialize(mx.init.Normal(sigma=1), force_reinit=True)
trainer = gluon.Trainer(net.collect_params(), 'adadelta', {'rho': 0.9999})
utils.optimize(batch_size=10, trainer=trainer, num_epochs=3, decay_epoch=None,
log_interval=10, X=X, y=y, net=net, print_lr=False)
```
## 结论
...
...
chapter_optimization/adadelta-scratch.md
浏览文件 @
ae48f45b
...
...
@@ -47,12 +47,13 @@ def adadelta(params, sqrs, deltas, rho, batch_size):
实验中,我们以线性回归为例。其中真实参数
`w`
为[2, -3.4],
`b`
为4.2。我们把算法中基于指数加权移动平均的变量初始化为和参数形状相同的零张量。
```
{.python .input n=1}
from mxnet import ndarray as nd
import mxnet as mx
from mxnet import autograd
from mxnet import gluon
from mxnet import nd
import random
# 为方便比较同一优化算法的从零开始实现和Gluon实现,将输出保持确定。
mx.random.seed(1)
random.seed(1)
...
...
@@ -64,16 +65,6 @@ true_b = 4.2
X = nd.random_normal(scale=1, shape=(num_examples, num_inputs))
y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
y += .01 * nd.random_normal(scale=1, shape=y.shape)
dataset = gluon.data.ArrayDataset(X, y)
# 构造迭代器。
import random
def data_iter(batch_size):
idx = list(range(num_examples))
random.shuffle(idx)
for batch_i, i in enumerate(range(0, num_examples, batch_size)):
j = nd.array(idx[i: min(i + batch_size, num_examples)])
yield batch_i, X.take(j), y.take(j)
# 初始化模型参数。
def init_params():
...
...
@@ -88,46 +79,45 @@ def init_params():
sqrs.append(param.zeros_like())
deltas.append(param.zeros_like())
return params, sqrs, deltas
# 线性回归模型。
def net(X, w, b):
return nd.dot(X, w) + b
# 损失函数。
def square_loss(yhat, y):
return (yhat - y.reshape(yhat.shape)) ** 2 / 2
```
接下来定义训练函数。当epoch大于2时(epoch从1开始计数),学习率以自乘0.1的方式自我衰减。训练函数的period参数说明,每次采样过该数目的数据点后,记录当前目标函数值用于作图。例如,当period和batch_size都为10时,每次迭代后均会记录目标函数值。
```
{.python .input n=2}
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib as mpl
mpl.rcParams['figure.dpi']= 120
import matplotlib.pyplot as plt
import numpy as np
def train(batch_size, rho, epochs, period):
assert period >= batch_size and period % batch_size == 0
[w, b], sqrs, deltas = init_params()
total_loss = [np.mean(square_loss(net(X, w, b), y).asnumpy())]
import sys
sys.path.append('..')
import utils
net = utils.linreg
squared_loss = utils.squared_loss
# 注意epoch从1开始计数。
for epoch in range(1, epochs + 1):
for batch_i, data, label in data_iter(batch_size):
def optimize(batch_size, rho, num_epochs, log_interval):
[w, b], sqrs, deltas = init_params()
y_vals = [nd.mean(squared_loss(net(X, w, b), y)).asnumpy()]
print('batch size', batch_size)
for epoch in range(1, num_epochs + 1):
for batch_i, features, label in utils.data_iter(
batch_size, num_examples, random, X, y):
with autograd.record():
output = net(
data
, w, b)
loss = square_loss(output, label)
output = net(
features
, w, b)
loss = square
d
_loss(output, label)
loss.backward()
adadelta([w, b], sqrs, deltas, rho, batch_size)
if batch_i * batch_size %
period
== 0:
total_loss.append(np.mean(square_loss(net(X, w, b), y).asnumpy()))
print("Batch size %d, Epoch %d, loss %.4e" %
(batch_size, epoch, total_los
s[-1]))
if batch_i * batch_size %
log_interval
== 0:
y_vals.append(
nd.mean(squared_loss(net(X, w, b), y)).asnumpy())
print('epoch %d, loss %.4e' % (epoch, y_val
s[-1]))
print('w:', np.reshape(w.asnumpy(), (1, -1)),
'b:', b.asnumpy()[0], '\n')
x_axis = np.linspace(0, epochs, len(total_loss), endpoint=True)
plt.semilogy(x_axis, total_loss)
x_vals = np.linspace(0, num_epochs, len(y_vals), endpoint=True)
utils.set_fig_size(mpl)
plt.semilogy(x_vals, y_vals)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()
...
...
@@ -136,7 +126,7 @@ def train(batch_size, rho, epochs, period):
使用Adadelta,最终学到的参数值与真实值较接近。
```
{.python .input n=3}
train(batch_size=10, rho=0.9999, epochs=3, period
=10)
optimize(batch_size=10, rho=0.9999, num_epochs=3, log_interval
=10)
```
## 结论
...
...
chapter_optimization/adagrad-gluon.md
浏览文件 @
ae48f45b
...
...
@@ -7,12 +7,12 @@
import mxnet as mx
from mxnet import autograd
from mxnet import gluon
from mxnet import ndarray as nd
import numpy as np
from mxnet import nd
import random
mx.random.seed(1)
# 为方便比较同一优化算法的从零开始实现和Gluon实现,将输出保持确定。
random.seed(1)
mx.random.seed(1)
# 生成数据集。
num_inputs = 2
...
...
@@ -22,55 +22,31 @@ true_b = 4.2
X = nd.random_normal(scale=1, shape=(num_examples, num_inputs))
y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
y += .01 * nd.random_normal(scale=1, shape=y.shape)
dataset = gluon.data.ArrayDataset(X, y)
# 创建模型和定义损失函数。
net = gluon.nn.Sequential()
net.add(gluon.nn.Dense(1))
square_loss = gluon.loss.L2Loss()
```
我们需要在
`gluon.Trainer`
中指定优化算法名称
`adagrad`
并设置参数。例如设置初始学习率
`learning_rate`
。
```
{.python .input n=2}
%matplotlib inline
import matplotlib as mpl
mpl.rcParams['figure.dpi']= 120
import matplotlib.pyplot as plt
def train(batch_size, lr, epochs, period):
assert period >= batch_size and period % batch_size == 0
net.collect_params().initialize(mx.init.Normal(sigma=1), force_reinit=True)
# Adagrad。
trainer = gluon.Trainer(net.collect_params(), 'adagrad',
{'learning_rate': lr})
data_iter = gluon.data.DataLoader(dataset, batch_size, shuffle=True)
total_loss = [np.mean(square_loss(net(X), y).asnumpy())]
for epoch in range(1, epochs + 1):
for batch_i, (data, label) in enumerate(data_iter):
with autograd.record():
output = net(data)
loss = square_loss(output, label)
loss.backward()
trainer.step(batch_size)
if batch_i * batch_size % period == 0:
total_loss.append(np.mean(square_loss(net(X), y).asnumpy()))
print("Batch size %d, Learning rate %f, Epoch %d, loss %.4e" %
(batch_size, trainer.learning_rate, epoch, total_loss[-1]))
print('w:', np.reshape(net[0].weight.data().asnumpy(), (1, -1)),
'b:', net[0].bias.data().asnumpy()[0], '\n')
x_axis = np.linspace(0, epochs, len(total_loss), endpoint=True)
plt.semilogy(x_axis, total_loss)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()
%config InlineBackend.figure_format = 'retina'
import numpy as np
import sys
sys.path.append('..')
import utils
```
使用Adagrad,最终学到的参数值与真实值较接近。
```
{.python .input n=3}
train(batch_size=10, lr=0.9, epochs=3, period=10)
net.collect_params().initialize(mx.init.Normal(sigma=1), force_reinit=True)
trainer = gluon.Trainer(net.collect_params(), 'adagrad',
{'learning_rate': 0.9})
utils.optimize(batch_size=10, trainer=trainer, num_epochs=3, decay_epoch=None,
log_interval=10, X=X, y=y, net=net)
```
## 结论
...
...
chapter_optimization/adagrad-scratch.md
浏览文件 @
ae48f45b
...
...
@@ -64,12 +64,13 @@ def adagrad(params, sqrs, lr, batch_size):
实验中,我们以线性回归为例。其中真实参数
`w`
为[2, -3.4],
`b`
为4.2。我们把梯度按元素平方的累加变量初始化为和参数形状相同的零张量。
```
{.python .input n=2}
from mxnet import ndarray as nd
import mxnet as mx
from mxnet import autograd
from mxnet import gluon
from mxnet import nd
import random
# 为方便比较同一优化算法的从零开始实现和Gluon实现,将输出保持确定。
mx.random.seed(1)
random.seed(1)
...
...
@@ -81,16 +82,6 @@ true_b = 4.2
X = nd.random_normal(scale=1, shape=(num_examples, num_inputs))
y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
y += .01 * nd.random_normal(scale=1, shape=y.shape)
dataset = gluon.data.ArrayDataset(X, y)
# 构造迭代器。
import random
def data_iter(batch_size):
idx = list(range(num_examples))
random.shuffle(idx)
for batch_i, i in enumerate(range(0, num_examples, batch_size)):
j = nd.array(idx[i: min(i + batch_size, num_examples)])
yield batch_i, X.take(j), y.take(j)
# 初始化模型参数。
def init_params():
...
...
@@ -103,14 +94,6 @@ def init_params():
# 把梯度按元素平方的累加变量初始化为和参数形状相同的零张量。
sqrs.append(param.zeros_like())
return params, sqrs
# 线性回归模型。
def net(X, w, b):
return nd.dot(X, w) + b
# 损失函数。
def square_loss(yhat, y):
return (yhat - y.reshape(yhat.shape)) ** 2 / 2
```
接下来定义训练函数。训练函数的period参数说明,每次采样过该数目的数据点后,记录当前目标函数值用于作图。例如,当period和batch_size都为10时,每次迭代后均会记录目标函数值。
...
...
@@ -119,32 +102,40 @@ def square_loss(yhat, y):
```
{.python .input n=3}
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib as mpl
mpl.rcParams['figure.dpi']= 120
import matplotlib.pyplot as plt
import numpy as np
def train(batch_size, lr, epochs, period):
assert period >= batch_size and period % batch_size == 0
[w, b], sqrs = init_params()
total_loss = [np.mean(square_loss(net(X, w, b), y).asnumpy())]
import sys
sys.path.append('..')
import utils
net = utils.linreg
squared_loss = utils.squared_loss
# 注意epoch从1开始计数。
for epoch in range(1, epochs + 1):
for batch_i, data, label in data_iter(batch_size):
def optimize(batch_size, lr, num_epochs, log_interval):
[w, b], sqrs = init_params()
y_vals = [nd.mean(squared_loss(net(X, w, b), y)).asnumpy()]
print('batch size', batch_size)
for epoch in range(1, num_epochs + 1):
for batch_i, features, label in utils.data_iter(
batch_size, num_examples, random, X, y):
with autograd.record():
output = net(
data
, w, b)
loss = square_loss(output, label)
output = net(
features
, w, b)
loss = square
d
_loss(output, label)
loss.backward()
adagrad([w, b], sqrs, lr, batch_size)
if batch_i * batch_size % period == 0:
total_loss.append(np.mean(square_loss(net(X, w, b), y).asnumpy()))
print("Batch size %d, Learning rate %f, Epoch %d, loss %.4e" %
(batch_size, lr, epoch, total_loss[-1]))
if batch_i * batch_size % log_interval == 0:
y_vals.append(
nd.mean(squared_loss(net(X, w, b), y)).asnumpy())
print('epoch %d, learning rate %f, loss %.4e' %
(epoch, lr, y_vals[-1]))
print('w:', np.reshape(w.asnumpy(), (1, -1)),
'b:', b.asnumpy()[0], '\n')
x_axis = np.linspace(0, epochs, len(total_loss), endpoint=True)
plt.semilogy(x_axis, total_loss)
x_vals = np.linspace(0, num_epochs, len(y_vals), endpoint=True)
utils.set_fig_size(mpl)
plt.semilogy(x_vals, y_vals)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()
...
...
@@ -153,7 +144,7 @@ def train(batch_size, lr, epochs, period):
使用Adagrad,最终学到的参数值与真实值较接近。
```
{.python .input n=4}
train(batch_size=10, lr=0.9, epochs=3, period
=10)
optimize(batch_size=10, lr=0.9, num_epochs=3, log_interval
=10)
```
## 结论
...
...
chapter_optimization/adam-gluon.md
浏览文件 @
ae48f45b
...
...
@@ -7,12 +7,12 @@
import mxnet as mx
from mxnet import autograd
from mxnet import gluon
from mxnet import ndarray as nd
import numpy as np
from mxnet import nd
import random
mx.random.seed(1)
# 为方便比较同一优化算法的从零开始实现和Gluon实现,将输出保持确定。
random.seed(1)
mx.random.seed(1)
# 生成数据集。
num_inputs = 2
...
...
@@ -22,55 +22,31 @@ true_b = 4.2
X = nd.random_normal(scale=1, shape=(num_examples, num_inputs))
y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
y += .01 * nd.random_normal(scale=1, shape=y.shape)
dataset = gluon.data.ArrayDataset(X, y)
# 创建模型和定义损失函数。
net = gluon.nn.Sequential()
net.add(gluon.nn.Dense(1))
square_loss = gluon.loss.L2Loss()
```
我们需要在
`gluon.Trainer`
中指定优化算法名称
`adam`
并设置学习率。
```
{.python .input n=2}
%matplotlib inline
import matplotlib as mpl
mpl.rcParams['figure.dpi']= 120
import matplotlib.pyplot as plt
def train(batch_size, lr, epochs, period):
assert period >= batch_size and period % batch_size == 0
net.collect_params().initialize(mx.init.Normal(sigma=1), force_reinit=True)
# Adam。
trainer = gluon.Trainer(net.collect_params(), 'adam',
{'learning_rate': lr})
data_iter = gluon.data.DataLoader(dataset, batch_size, shuffle=True)
total_loss = [np.mean(square_loss(net(X), y).asnumpy())]
for epoch in range(1, epochs + 1):
for batch_i, (data, label) in enumerate(data_iter):
with autograd.record():
output = net(data)
loss = square_loss(output, label)
loss.backward()
trainer.step(batch_size)
if batch_i * batch_size % period == 0:
total_loss.append(np.mean(square_loss(net(X), y).asnumpy()))
print("Batch size %d, Learning rate %f, Epoch %d, loss %.4e" %
(batch_size, trainer.learning_rate, epoch, total_loss[-1]))
print('w:', np.reshape(net[0].weight.data().asnumpy(), (1, -1)),
'b:', net[0].bias.data().asnumpy()[0], '\n')
x_axis = np.linspace(0, epochs, len(total_loss), endpoint=True)
plt.semilogy(x_axis, total_loss)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()
%config InlineBackend.figure_format = 'retina'
import numpy as np
import sys
sys.path.append('..')
import utils
```
使用Adam,最终学到的参数值与真实值较接近。
```
{.python .input n=3}
train(batch_size=10, lr=0.1, epochs=3, period=10)
net.collect_params().initialize(mx.init.Normal(sigma=1), force_reinit=True)
trainer = gluon.Trainer(net.collect_params(), 'adam',
{'learning_rate': 0.1})
utils.optimize(batch_size=10, trainer=trainer, num_epochs=3, decay_epoch=None,
log_interval=10, X=X, y=y, net=net)
```
## 结论
...
...
chapter_optimization/adam-scratch.md
浏览文件 @
ae48f45b
...
...
@@ -69,10 +69,11 @@ def adam(params, vs, sqrs, lr, batch_size, t):
```
{.python .input n=1}
import mxnet as mx
from mxnet import autograd
from mxnet import ndarray as nd
from mxnet import gluon
from mxnet import nd
import random
# 为方便比较同一优化算法的从零开始实现和Gluon实现,将输出保持确定。
mx.random.seed(1)
random.seed(1)
...
...
@@ -84,16 +85,18 @@ true_b = 4.2
X = nd.random_normal(scale=1, shape=(num_examples, num_inputs))
y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
y += .01 * nd.random_normal(scale=1, shape=y.shape)
dataset = gluon.data.ArrayDataset(X, y)
# 构造迭代器。
import random
def data_iter(batch_size):
idx = list(range(num_examples))
random.shuffle(idx)
for batch_i, i in enumerate(range(0, num_examples, batch_size)):
j = nd.array(idx[i: min(i + batch_size, num_examples)])
yield batch_i, X.take(j), y.take(j)
# 初始化模型参数。
def init_params():
w = nd.random_normal(scale=1, shape=(num_inputs, 1))
b = nd.zeros(shape=(1,))
params = [w, b]
sqrs = []
for param in params:
param.attach_grad()
# 把梯度按元素平方的指数加权移动平均变量初始化为和参数形状相同的零张量。
sqrs.append(param.zeros_like())
return params, sqrs
# 初始化模型参数。
def init_params():
...
...
@@ -108,49 +111,50 @@ def init_params():
vs.append(param.zeros_like())
sqrs.append(param.zeros_like())
return params, vs, sqrs
# 线性回归模型。
def net(X, w, b):
return nd.dot(X, w) + b
# 损失函数。
def square_loss(yhat, y):
return (yhat - y.reshape(yhat.shape)) ** 2 / 2
```
接下来定义训练函数。当epoch大于2时(epoch从1开始计数),学习率以自乘0.1的方式自我衰减。训练函数的period参数说明,每次采样过该数目的数据点后,记录当前目标函数值用于作图。例如,当period和batch_size都为10时,每次迭代后均会记录目标函数值。
```
{.python .input n=2}
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib as mpl
mpl.rcParams['figure.dpi']= 120
import matplotlib.pyplot as plt
import numpy as np
def train(batch_size, lr, epochs, period):
assert period >= batch_size and period % batch_size == 0
[w, b], vs, sqrs = init_params()
total_loss = [np.mean(square_loss(net(X, w, b), y).asnumpy())]
import sys
sys.path.append('..')
import utils
net = utils.linreg
squared_loss = utils.squared_loss
# 注意epoch从1开始计数。
def optimize(batch_size, lr, num_epochs, log_interval):
[w, b], vs, sqrs = init_params()
y_vals = [nd.mean(squared_loss(net(X, w, b), y)).asnumpy()]
print('batch size', batch_size)
t = 0
for epoch in range(1, epochs + 1):
for batch_i, data, label in data_iter(batch_size):
for epoch in range(1, num_epochs + 1):
for batch_i, features, label in utils.data_iter(
batch_size, num_examples, random, X, y):
with autograd.record():
output = net(
data
, w, b)
loss = square_loss(output, label)
output = net(
features
, w, b)
loss = square
d
_loss(output, label)
loss.backward()
# 必须在调用Adam前。
t += 1
adam([w, b], vs, sqrs, lr, batch_size, t)
if batch_i * batch_size % period == 0:
total_loss.append(np.mean(square_loss(net(X, w, b), y).asnumpy()))
print("Batch size %d, Learning rate %f, Epoch %d, loss %.4e" %
(batch_size, lr, epoch, total_loss[-1]))
if batch_i * batch_size % log_interval == 0:
y_vals.append(
nd.mean(squared_loss(net(X, w, b), y)).asnumpy())
print('epoch %d, learning rate %f, loss %.4e' %
(epoch, lr, y_vals[-1]))
print('w:', np.reshape(w.asnumpy(), (1, -1)),
'b:', b.asnumpy()[0], '\n')
x_axis = np.linspace(0, epochs, len(total_loss), endpoint=True)
plt.semilogy(x_axis, total_loss)
x_vals = np.linspace(0, num_epochs, len(y_vals), endpoint=True)
utils.set_fig_size(mpl)
plt.semilogy(x_vals, y_vals)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()
...
...
@@ -159,7 +163,7 @@ def train(batch_size, lr, epochs, period):
使用Adam,最终学到的参数值与真实值较接近。
```
{.python .input n=3}
train(batch_size=10, lr=0.1, epochs=3, period
=10)
optimize(batch_size=10, lr=0.1, num_epochs=3, log_interval
=10)
```
## 结论
...
...
chapter_optimization/gd-sgd-gluon.md
浏览文件 @
ae48f45b
...
...
@@ -22,7 +22,6 @@ X = nd.random_normal(scale=1, shape=(num_examples, num_inputs))
y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
y += .01 * nd.random_normal(scale=1, shape=y.shape)
# 创建模型和定义损失函数。
net = gluon.nn.Sequential()
net.add(gluon.nn.Dense(1))
...
...
chapter_optimization/gd-sgd-scratch.md
浏览文件 @
ae48f45b
此差异已折叠。
点击以展开。
chapter_optimization/momentum-gluon.md
浏览文件 @
ae48f45b
...
...
@@ -7,12 +7,12 @@
import mxnet as mx
from mxnet import autograd
from mxnet import gluon
from mxnet import ndarray as nd
import numpy as np
from mxnet import nd
import random
mx.random.seed(1)
# 为方便比较同一优化算法的从零开始实现和Gluon实现,将输出保持确定。
random.seed(1)
mx.random.seed(1)
# 生成数据集。
num_inputs = 2
...
...
@@ -22,59 +22,32 @@ true_b = 4.2
X = nd.random_normal(scale=1, shape=(num_examples, num_inputs))
y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
y += .01 * nd.random_normal(scale=1, shape=y.shape)
dataset = gluon.data.ArrayDataset(X, y)
# 创建模型和定义损失函数。
net = gluon.nn.Sequential()
net.add(gluon.nn.Dense(1))
square_loss = gluon.loss.L2Loss()
```
为了使学习率在两个epoch后自我衰减,我们需要访问
`gluon.Trainer`
的
`learning_rate`
属性和
`set_learning_rate`
函数。
```
{.python .input
n=2
}
```
{.python .input}
%matplotlib inline
import matplotlib as mpl
mpl.rcParams['figure.dpi']= 120
import matplotlib.pyplot as plt
def train(batch_size, lr, mom, epochs, period):
assert period >= batch_size and period % batch_size == 0
net.collect_params().initialize(mx.init.Normal(sigma=1), force_reinit=True)
# 动量法。
trainer = gluon.Trainer(net.collect_params(), 'sgd',
{'learning_rate': lr, 'momentum': mom})
data_iter = gluon.data.DataLoader(dataset, batch_size, shuffle=True)
total_loss = [np.mean(square_loss(net(X), y).asnumpy())]
for epoch in range(1, epochs + 1):
# 重设学习率。
if epoch > 2:
trainer.set_learning_rate(trainer.learning_rate * 0.1)
for batch_i, (data, label) in enumerate(data_iter):
with autograd.record():
output = net(data)
loss = square_loss(output, label)
loss.backward()
trainer.step(batch_size)
if batch_i * batch_size % period == 0:
total_loss.append(np.mean(square_loss(net(X), y).asnumpy()))
print("Batch size %d, Learning rate %f, Epoch %d, loss %.4e" %
(batch_size, trainer.learning_rate, epoch, total_loss[-1]))
print('w:', np.reshape(net[0].weight.data().asnumpy(), (1, -1)),
'b:', net[0].bias.data().asnumpy()[0], '\n')
x_axis = np.linspace(0, epochs, len(total_loss), endpoint=True)
plt.semilogy(x_axis, total_loss)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()
%config InlineBackend.figure_format = 'retina'
import numpy as np
import sys
sys.path.append('..')
import utils
```
使用动量法,最终学到的参数值与真实值较接近。
```
{.python .input n=3}
train(batch_size=10, lr=0.2, mom=0.9, epochs=3, period=10)
net.collect_params().initialize(mx.init.Normal(sigma=1), force_reinit=True)
trainer = gluon.Trainer(net.collect_params(), 'sgd',
{'learning_rate': 0.2, 'momentum': 0.9})
utils.optimize(batch_size=10, trainer=trainer, num_epochs=3, decay_epoch=2,
log_interval=10, X=X, y=y, net=net)
```
## 结论
...
...
chapter_optimization/momentum-scratch.md
浏览文件 @
ae48f45b
...
...
@@ -65,6 +65,7 @@ from mxnet import ndarray as nd
from mxnet import gluon
import random
# 为方便比较同一优化算法的从零开始实现和Gluon实现,将输出保持确定。
mx.random.seed(1)
random.seed(1)
...
...
@@ -76,16 +77,6 @@ true_b = 4.2
X = nd.random_normal(scale=1, shape=(num_examples, num_inputs))
y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
y += .01 * nd.random_normal(scale=1, shape=y.shape)
dataset = gluon.data.ArrayDataset(X, y)
# 构造迭代器。
import random
def data_iter(batch_size):
idx = list(range(num_examples))
random.shuffle(idx)
for batch_i, i in enumerate(range(0, num_examples, batch_size)):
j = nd.array(idx[i: min(i + batch_size, num_examples)])
yield batch_i, X.take(j), y.take(j)
# 初始化模型参数。
def init_params():
...
...
@@ -98,49 +89,49 @@ def init_params():
# 把速度项初始化为和参数形状相同的零张量。
vs.append(param.zeros_like())
return params, vs
# 线性回归模型。
def net(X, w, b):
return nd.dot(X, w) + b
# 损失函数。
def square_loss(yhat, y):
return (yhat - y.reshape(yhat.shape)) ** 2 / 2
```
接下来定义训练函数。当epoch大于2时(epoch从1开始计数),学习率以自乘0.1的方式自我衰减。训练函数的period参数说明,每次采样过该数目的数据点后,记录当前目标函数值用于作图。例如,当period和batch_size都为10时,每次迭代后均会记录目标函数值。
```
{.python .input n=3}
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib as mpl
mpl.rcParams['figure.dpi']= 120
import matplotlib.pyplot as plt
import numpy as np
def train(batch_size, lr, mom, epochs, period):
assert period >= batch_size and period % batch_size == 0
[w, b], vs = init_params()
total_loss = [np.mean(square_loss(net(X, w, b), y).asnumpy())]
import sys
sys.path.append('..')
import utils
net = utils.linreg
squared_loss = utils.squared_loss
# 注意epoch从1开始计数。
for epoch in range(1, epochs + 1):
# 重设学习率。
def optimize(batch_size, lr, mom, num_epochs, log_interval):
[w, b], vs = init_params()
y_vals = [nd.mean(squared_loss(net(X, w, b), y)).asnumpy()]
print('batch size', batch_size)
for epoch in range(1, num_epochs + 1):
# 学习率自我衰减。
if epoch > 2:
lr *= 0.1
for batch_i, data, label in data_iter(batch_size):
for batch_i, features, label in utils.data_iter(
batch_size, num_examples, random, X, y):
with autograd.record():
output = net(
data
, w, b)
loss = square_loss(output, label)
output = net(
features
, w, b)
loss = square
d
_loss(output, label)
loss.backward()
sgd_momentum([w, b], vs, lr, mom, batch_size)
if batch_i * batch_size % period == 0:
total_loss.append(np.mean(square_loss(net(X, w, b), y).asnumpy()))
print("Batch size %d, Learning rate %f, Epoch %d, loss %.4e" %
(batch_size, lr, epoch, total_loss[-1]))
print('w:', np.reshape(w.asnumpy(), (1, -1)),
if batch_i * batch_size % log_interval == 0:
y_vals.append(
nd.mean(squared_loss(net(X, w, b), y)).asnumpy())
print('epoch %d, learning rate %f, loss %.4e' %
(epoch, lr, y_vals[-1]))
print('w:', np.reshape(w.asnumpy(), (1, -1)),
'b:', b.asnumpy()[0], '\n')
x_axis = np.linspace(0, epochs, len(total_loss), endpoint=True)
plt.semilogy(x_axis, total_loss)
x_vals = np.linspace(0, num_epochs, len(y_vals), endpoint=True)
utils.set_fig_size(mpl)
plt.semilogy(x_vals, y_vals)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()
...
...
@@ -149,7 +140,7 @@ def train(batch_size, lr, mom, epochs, period):
使用动量法,最终学到的参数值与真实值较接近。
```
{.python .input n=4}
train(batch_size=10, lr=0.2, mom=0.9, epochs=3, period
=10)
optimize(batch_size=10, lr=0.2, mom=0.9, num_epochs=3, log_interval
=10)
```
## 结论
...
...
chapter_optimization/rmsprop-gluon.md
浏览文件 @
ae48f45b
...
...
@@ -7,12 +7,12 @@
import mxnet as mx
from mxnet import autograd
from mxnet import gluon
from mxnet import ndarray as nd
import numpy as np
from mxnet import nd
import random
mx.random.seed(1)
# 为方便比较同一优化算法的从零开始实现和Gluon实现,将输出保持确定。
random.seed(1)
mx.random.seed(1)
# 生成数据集。
num_inputs = 2
...
...
@@ -22,61 +22,41 @@ true_b = 4.2
X = nd.random_normal(scale=1, shape=(num_examples, num_inputs))
y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
y += .01 * nd.random_normal(scale=1, shape=y.shape)
dataset = gluon.data.ArrayDataset(X, y)
# 创建模型和定义损失函数。
net = gluon.nn.Sequential()
net.add(gluon.nn.Dense(1))
square_loss = gluon.loss.L2Loss()
```
我们需要在
`gluon.Trainer`
中指定优化算法名称
`rmsprop`
并设置参数。例如设置初始学习率
`learning_rate`
和指数加权移动平均中gamma1参数。
```
{.python .input n=2}
%matplotlib inline
import matplotlib as mpl
mpl.rcParams['figure.dpi']= 120
import matplotlib.pyplot as plt
def train(batch_size, lr, gamma, epochs, period):
assert period >= batch_size and period % batch_size == 0
net.collect_params().initialize(mx.init.Normal(sigma=1), force_reinit=True)
# RMSProp。
trainer = gluon.Trainer(net.collect_params(), 'rmsprop',
{'learning_rate': lr, 'gamma1': gamma})
data_iter = gluon.data.DataLoader(dataset, batch_size, shuffle=True)
total_loss = [np.mean(square_loss(net(X), y).asnumpy())]
for epoch in range(1, epochs + 1):
for batch_i, (data, label) in enumerate(data_iter):
with autograd.record():
output = net(data)
loss = square_loss(output, label)
loss.backward()
trainer.step(batch_size)
if batch_i * batch_size % period == 0:
total_loss.append(np.mean(square_loss(net(X), y).asnumpy()))
print("Batch size %d, Learning rate %f, Epoch %d, loss %.4e" %
(batch_size, trainer.learning_rate, epoch, total_loss[-1]))
print('w:', np.reshape(net[0].weight.data().asnumpy(), (1, -1)),
'b:', net[0].bias.data().asnumpy()[0], '\n')
x_axis = np.linspace(0, epochs, len(total_loss), endpoint=True)
plt.semilogy(x_axis, total_loss)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()
%config InlineBackend.figure_format = 'retina'
import numpy as np
import sys
sys.path.append('..')
import utils
```
我们将初始学习率设为0.03,并将gamma设为0.9。损失函数在迭代后期较震荡。
```
{.python .input n=3}
train(batch_size=10, lr=0.03, gamma=0.9, epochs=3, period=10)
net.collect_params().initialize(mx.init.Normal(sigma=1), force_reinit=True)
trainer = gluon.Trainer(net.collect_params(), 'rmsprop',
{'learning_rate': 0.03, 'gamma1': 0.9})
utils.optimize(batch_size=10, trainer=trainer, num_epochs=3, decay_epoch=None,
log_interval=10, X=X, y=y, net=net)
```
我们将gamma调大一点,例如0.999。这时损失函数在迭代后期较平滑。
```
{.python .input}
train(batch_size=10, lr=0.03, gamma=0.999, epochs=3, period=10)
net.collect_params().initialize(mx.init.Normal(sigma=1), force_reinit=True)
trainer = gluon.Trainer(net.collect_params(), 'rmsprop',
{'learning_rate': 0.03, 'gamma1': 0.999})
utils.optimize(batch_size=10, trainer=trainer, num_epochs=3, decay_epoch=None,
log_interval=10, X=X, y=y, net=net)
```
## 结论
...
...
chapter_optimization/rmsprop-scratch.md
浏览文件 @
ae48f45b
...
...
@@ -47,12 +47,13 @@ def rmsprop(params, sqrs, lr, gamma, batch_size):
实验中,我们以线性回归为例。其中真实参数
`w`
为[2, -3.4],
`b`
为4.2。我们把梯度按元素平方的指数加权移动平均变量初始化为和参数形状相同的零张量。
```
{.python .input n=1}
from mxnet import ndarray as nd
import mxnet as mx
from mxnet import autograd
from mxnet import gluon
from mxnet import nd
import random
# 为方便比较同一优化算法的从零开始实现和Gluon实现,将输出保持确定。
mx.random.seed(1)
random.seed(1)
...
...
@@ -64,16 +65,6 @@ true_b = 4.2
X = nd.random_normal(scale=1, shape=(num_examples, num_inputs))
y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b
y += .01 * nd.random_normal(scale=1, shape=y.shape)
dataset = gluon.data.ArrayDataset(X, y)
# 构造迭代器。
import random
def data_iter(batch_size):
idx = list(range(num_examples))
random.shuffle(idx)
for batch_i, i in enumerate(range(0, num_examples, batch_size)):
j = nd.array(idx[i: min(i + batch_size, num_examples)])
yield batch_i, X.take(j), y.take(j)
# 初始化模型参数。
def init_params():
...
...
@@ -86,46 +77,46 @@ def init_params():
# 把梯度按元素平方的指数加权移动平均变量初始化为和参数形状相同的零张量。
sqrs.append(param.zeros_like())
return params, sqrs
# 线性回归模型。
def net(X, w, b):
return nd.dot(X, w) + b
# 损失函数。
def square_loss(yhat, y):
return (yhat - y.reshape(yhat.shape)) ** 2 / 2
```
接下来定义训练函数。训练函数的period参数说明,每次采样过该数目的数据点后,记录当前目标函数值用于作图。例如,当period和batch_size都为10时,每次迭代后均会记录目标函数值。
```
{.python .input n=2}
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib as mpl
mpl.rcParams['figure.dpi']= 120
import matplotlib.pyplot as plt
import numpy as np
def train(batch_size, lr, gamma, epochs, period):
assert period >= batch_size and period % batch_size == 0
[w, b], sqrs = init_params()
total_loss = [np.mean(square_loss(net(X, w, b), y).asnumpy())]
import sys
sys.path.append('..')
import utils
net = utils.linreg
squared_loss = utils.squared_loss
# 注意epoch从1开始计数。
for epoch in range(1, epochs + 1):
for batch_i, data, label in data_iter(batch_size):
def optimize(batch_size, lr, gamma, num_epochs, log_interval):
[w, b], sqrs = init_params()
y_vals = [nd.mean(squared_loss(net(X, w, b), y)).asnumpy()]
print('batch size', batch_size)
for epoch in range(1, num_epochs + 1):
for batch_i, features, label in utils.data_iter(
batch_size, num_examples, random, X, y):
with autograd.record():
output = net(
data
, w, b)
loss = square_loss(output, label)
output = net(
features
, w, b)
loss = square
d
_loss(output, label)
loss.backward()
rmsprop([w, b], sqrs, lr, gamma, batch_size)
if batch_i * batch_size % period == 0:
total_loss.append(np.mean(square_loss(net(X, w, b), y).asnumpy()))
print("Batch size %d, Learning rate %f, Epoch %d, loss %.4e" %
(batch_size, lr, epoch, total_loss[-1]))
if batch_i * batch_size % log_interval == 0:
y_vals.append(
nd.mean(squared_loss(net(X, w, b), y)).asnumpy())
print('epoch %d, learning rate %f, loss %.4e' %
(epoch, lr, y_vals[-1]))
print('w:', np.reshape(w.asnumpy(), (1, -1)),
'b:', b.asnumpy()[0], '\n')
x_axis = np.linspace(0, epochs, len(total_loss), endpoint=True)
plt.semilogy(x_axis, total_loss)
x_vals = np.linspace(0, num_epochs, len(y_vals), endpoint=True)
utils.set_fig_size(mpl)
plt.semilogy(x_vals, y_vals)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()
...
...
@@ -134,13 +125,13 @@ def train(batch_size, lr, gamma, epochs, period):
我们将初始学习率设为0.03,并将gamma设为0.9。损失函数在迭代后期较震荡。
```
{.python .input n=3}
train(batch_size=10, lr=0.03, gamma=0.9, epochs=3, period
=10)
optimize(batch_size=10, lr=0.03, gamma=0.9, num_epochs=3, log_interval
=10)
```
我们将gamma调大一点,例如0.999。这时损失函数在迭代后期较平滑。
```
{.python .input}
train(batch_size=10, lr=0.03, gamma=0.999, epochs=3, period
=10)
optimize(batch_size=10, lr=0.03, gamma=0.999, num_epochs=3, log_interval
=10)
```
## 结论
...
...
utils.py
浏览文件 @
ae48f45b
...
...
@@ -369,7 +369,7 @@ def squared_loss(yhat, y):
def
optimize
(
batch_size
,
trainer
,
num_epochs
,
decay_epoch
,
log_interval
,
X
,
y
,
net
):
net
,
print_lr
=
True
):
"""优化目标函数。"""
dataset
=
gluon
.
data
.
ArrayDataset
(
X
,
y
)
data_iter
=
gluon
.
data
.
DataLoader
(
dataset
,
batch_size
,
shuffle
=
True
)
...
...
@@ -388,8 +388,11 @@ def optimize(batch_size, trainer, num_epochs, decay_epoch, log_interval, X, y,
trainer
.
step
(
batch_size
)
if
batch_i
*
batch_size
%
log_interval
==
0
:
y_vals
.
append
(
nd
.
mean
(
square_loss
(
net
(
X
),
y
)).
asnumpy
())
print
(
"epoch %d, learning rate %f, loss %.4e"
%
(
epoch
,
trainer
.
learning_rate
,
y_vals
[
-
1
]))
if
print_lr
:
print
(
"epoch %d, learning rate %f, loss %.4e"
%
(
epoch
,
trainer
.
learning_rate
,
y_vals
[
-
1
]))
else
:
print
(
"epoch %d, loss %.4e"
%
(
epoch
,
y_vals
[
-
1
]))
print
(
'w:'
,
np
.
reshape
(
net
[
0
].
weight
.
data
().
asnumpy
(),
(
1
,
-
1
)),
'b:'
,
net
[
0
].
bias
.
data
().
asnumpy
()[
0
],
'
\n
'
)
x_vals
=
np
.
linspace
(
0
,
num_epochs
,
len
(
y_vals
),
endpoint
=
True
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录