Commit 7fbc02f3 authored by Aston Zhang

optimization default args

Parent 64ab5b5a
@@ -79,8 +79,7 @@ def adadelta(params_vars, hyperparams, batch_size):
As we can see, the model parameter values obtained through optimization are close to their true values.
```{.python .input n=3}
-gb.optimize(optimizer_fn=adadelta, batch_size=10, num_epochs=3,
-            log_interval=10, params_vars=init_params_vars(),
+gb.optimize(optimizer_fn=adadelta, params_vars=init_params_vars(),
hyperparams={'rho': 0.9999}, features=features, labels=labels)
```
@@ -94,9 +93,8 @@ net.add(nn.Dense(1))
net.initialize(init.Normal(sigma=0.01), force_reinit=True)
trainer = gluon.Trainer(net.collect_params(), 'adadelta', {'rho': 0.9999})
-gb.optimize_with_trainer(batch_size=10, trainer=trainer, num_epochs=3,
-                         decay_epoch=None, log_interval=10, features=features,
-                         labels=labels, net=net)
+gb.optimize_with_trainer(trainer=trainer, features=features, labels=labels,
+                         net=net)
```
## Summary
......
@@ -87,8 +87,7 @@ def adagrad(params_vars, hyperparams, batch_size):
In this experiment, the initial learning rate `lr` is not self-decayed. In the end, the model parameter values obtained through optimization are close to their true values.
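The `adagrad` function named in the hunk header above is collapsed in this diff. As a reminder of why no manual decay is needed, here is a minimal, illustrative sketch of the standard Adagrad update (the names `adagrad_step`, `s`, and `eps` are assumptions, not the book's code): the accumulated squared gradients in `s` already shrink the effective per-coordinate step size over time.

```python
# Illustrative Adagrad sketch (not the book's implementation).
def adagrad_step(params, grads, s, lr=0.9, eps=1e-6):
    for i in range(len(params)):
        s[i] += grads[i] ** 2                              # accumulate squared gradients
        params[i] -= lr * grads[i] / (s[i] + eps) ** 0.5   # effective step shrinks as s grows
    return params, s

params, s = [0.0, 0.0], [0.0, 0.0]
params, s = adagrad_step(params, grads=[1.0, -2.0], s=s)
print(params, s)
```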
```{.python .input n=4}
-gb.optimize(optimizer_fn=adagrad, batch_size=10, num_epochs=3,
-            log_interval=10, params_vars=init_params_vars(),
+gb.optimize(optimizer_fn=adagrad, params_vars=init_params_vars(),
hyperparams={'lr': 0.9}, features=features, labels=labels)
```
@@ -103,9 +102,8 @@ net.add(nn.Dense(1))
net.initialize(init.Normal(sigma=0.01), force_reinit=True)
trainer = gluon.Trainer(net.collect_params(), 'adagrad',
{'learning_rate': 0.9})
-gb.optimize_with_trainer(batch_size=10, trainer=trainer, num_epochs=3,
-                         decay_epoch=None, log_interval=10, features=features,
-                         labels=labels, net=net)
+gb.optimize_with_trainer(trainer=trainer, features=features, labels=labels,
+                         net=net)
```
## Summary
......
@@ -93,9 +93,9 @@ def adam(params_vars, hyperparams, batch_size, t):
As we can see, the model parameter values obtained through optimization are close to their true values.
```{.python .input n=3}
-gb.optimize(optimizer_fn=adam, batch_size=10, num_epochs=3, log_interval=10,
-            params_vars=init_params_vars(), hyperparams={'lr': 0.1},
-            features=features, labels=labels, decay_epoch=None, is_adam=True)
+gb.optimize(optimizer_fn=adam, params_vars=init_params_vars(),
+            hyperparams={'lr': 0.1}, features=features, labels=labels,
+            is_adam=True)
```
## Implementation with Gluon
@@ -108,9 +108,8 @@ net.add(nn.Dense(1))
net.initialize(init.Normal(sigma=0.01), force_reinit=True)
trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': 0.1})
-gb.optimize_with_trainer(batch_size=10, trainer=trainer, num_epochs=3,
-                         decay_epoch=None, log_interval=10, features=features,
-                         labels=labels, net=net)
+gb.optimize_with_trainer(trainer=trainer, features=features, labels=labels,
+                         net=net)
```
## Summary
......
@@ -192,8 +192,8 @@ def sgd(params_vars, hyperparams, batch_size):
Since the variance of the stochastic gradient cannot be reduced during iteration, (mini-batch) stochastic gradient descent usually self-decays its learning rate. In this way, the variance of the product of the learning rate and the stochastic gradient also decays. In this experiment, when the epoch (`epoch`) is greater than 2, the learning rate of (mini-batch) stochastic gradient descent self-decays by multiplying itself by 0.1 at the beginning of each epoch. Gradient descent, in contrast, always uses the true gradient of the objective function during iteration and does not need to self-decay its learning rate. During iteration, each time `log_interval` examples have been sampled, the current value of the loss function (`loss`) is recorded for plotting. For example, when `batch_size` and `log_interval` are both 10, the loss value after every iteration is used for plotting.
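As a quick illustration of the decay schedule just described (a sketch only, not the book's `optimize` implementation; the helper name `decayed_lr` is an assumption), the effective learning rate per epoch can be computed as follows:

```python
# Epoch-wise learning-rate self-decay described above: once the epoch index
# exceeds `decay_epoch`, the rate is multiplied by 0.1 at the start of each epoch.
def decayed_lr(base_lr, epoch, decay_epoch=2):
    if decay_epoch is None or epoch <= decay_epoch:
        return base_lr
    return base_lr * 0.1 ** (epoch - decay_epoch)

print([round(decayed_lr(0.2, e), 6) for e in range(1, 6)])
# [0.2, 0.2, 0.02, 0.002, 0.0002]
```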
```{.python .input n=9}
-def optimize(optimizer_fn, batch_size, num_epochs, log_interval,
-             params_vars, hyperparams, features, labels, decay_epoch=None,
+def optimize(optimizer_fn, params_vars, hyperparams, features, labels,
+             decay_epoch=None, batch_size=10, log_interval=10, num_epochs=3,
is_adam=False):
dataset = gdata.ArrayDataset(features, labels)
data_iter = gdata.DataLoader(dataset, batch_size, shuffle=True)
@@ -230,42 +230,42 @@ def optimize(optimizer_fn, batch_size, num_epochs, log_interval,
When the batch size is 1, the optimization uses stochastic gradient descent. At the current learning rate, the loss drops quickly in the early stage and then fluctuates slightly. This is because the variance of the stochastic gradient cannot be reduced during iteration. Once the epoch exceeds 2 and the learning rate has self-decayed, the loss declines and then stays fairly flat. In the end, the model parameters `w` and `b` obtained through optimization are close to their true values [2, -3.4] and 4.2.
```{.python .input n=10}
-optimize(optimizer_fn=sgd, batch_size=1, num_epochs=3, log_interval=10,
-         params_vars=init_params_vars(), hyperparams={'lr': 0.2},
-         features=features, labels=labels, decay_epoch=2)
+optimize(optimizer_fn=sgd, params_vars=init_params_vars(),
+         hyperparams={'lr': 0.2}, features=features, labels=labels,
+         decay_epoch=2, batch_size=1)
```
When the batch size is 1000, since the total number of examples is also 1000, the optimization uses gradient descent. Gradient descent does not need to self-decay its learning rate (`decay_epoch=None`). In the end, the model parameter values obtained through optimization are close to their true values. Note that one epoch of gradient descent updates the model parameters only once, whereas stochastic gradient descent with a batch size of 1 updates them 1000 times within one epoch. We observe that after one epoch, the loss obtained by gradient descent is slightly larger than that obtained by stochastic gradient descent, while after three epochs the two losses are close.
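The comparison above comes down to how many parameter updates each method performs per epoch; a one-line calculation with the sizes used in this experiment (illustrative only) makes the gap explicit.

```python
# Updates per epoch = number of examples / batch size (sizes from this experiment).
num_examples = 1000
for batch_size in (1, 10, 1000):
    print(batch_size, num_examples // batch_size)
# batch_size=1 (SGD): 1000 updates per epoch; batch_size=1000 (gradient descent): 1 update.
```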
```{.python .input n=11}
-optimize(optimizer_fn=sgd, batch_size=1000, num_epochs=3, log_interval=1000,
-         params_vars=init_params_vars(), hyperparams={'lr': 0.999},
-         features=features, labels=labels, decay_epoch=None)
+optimize(optimizer_fn=sgd, params_vars=init_params_vars(),
+         hyperparams={'lr': 0.999}, features=features, labels=labels,
+         decay_epoch=None, batch_size=1000, log_interval=1000)
```
When the batch size is 10, since the total number of examples is 1000, the optimization uses mini-batch stochastic gradient descent. In the end, the model parameter values obtained through optimization are close to their true values.
```{.python .input n=12}
-optimize(optimizer_fn=sgd, batch_size=10, num_epochs=3, log_interval=10,
-         params_vars=init_params_vars(), hyperparams={'lr': 0.2},
-         features=features, labels=labels, decay_epoch=2)
+optimize(optimizer_fn=sgd, params_vars=init_params_vars(),
+         hyperparams={'lr': 0.2}, features=features, labels=labels,
+         decay_epoch=2, batch_size=10)
```
With the same batch size of 10, we now increase the learning rate. The loss keeps growing until it becomes "nan" (not a number). This is because an excessively large learning rate makes the model parameters overshoot the optimal solution and diverge. The learned model parameters end up as "nan" as well.
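To see why an overly large learning rate makes the parameters overshoot and diverge, consider gradient descent on the one-dimensional quadratic $f(x) = x^2$: the update is $x \leftarrow (1 - 2\,\mathrm{lr})\,x$, which diverges whenever $|1 - 2\,\mathrm{lr}| > 1$. A tiny self-contained sketch (not the book's code) reproduces the effect with two learning rates:

```python
# Gradient descent on f(x) = x**2; the gradient is 2*x, so x <- x - lr*2*x = (1 - 2*lr)*x.
def run(lr, steps=10, x=1.0):
    for _ in range(steps):
        x -= lr * 2 * x
    return x

print(run(lr=0.2))  # |1 - 0.4| = 0.6 < 1: shrinks toward 0 (about 0.006 after 10 steps)
print(run(lr=5))    # |1 - 10| = 9 > 1: blows up (about 3.5e9 after 10 steps, eventually inf/nan)
```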
```{.python .input n=13}
-optimize(optimizer_fn=sgd, batch_size=10, num_epochs=3, log_interval=10,
-         params_vars=init_params_vars(), hyperparams={'lr': 5},
-         features=features, labels=labels, decay_epoch=2)
+optimize(optimizer_fn=sgd, params_vars=init_params_vars(),
+         hyperparams={'lr': 5}, features=features, labels=labels,
+         decay_epoch=2, batch_size=10)
```
With the same batch size of 10, we now decrease the learning rate. We observe that the loss declines slowly, and even after three epochs the model parameters have not come close to their true values.
```{.python .input n=14}
-optimize(optimizer_fn=sgd, batch_size=10, num_epochs=3, log_interval=10,
-         params_vars=init_params_vars(), hyperparams={'lr': 0.002},
-         features=features, labels=labels, decay_epoch=2)
+optimize(optimizer_fn=sgd, params_vars=init_params_vars(),
+         hyperparams={'lr': 0.002}, features=features, labels=labels,
+         decay_epoch=2, batch_size=10)
```
## Implementation with Gluon
@@ -277,9 +277,8 @@ optimize(optimizer_fn=sgd, batch_size=10, num_epochs=3, log_interval=10,
net = nn.Sequential()
net.add(nn.Dense(1))
# Optimize the objective function.
-def optimize_with_trainer(batch_size, trainer, num_epochs, decay_epoch,
-                          log_interval, features, labels, net):
+def optimize_with_trainer(trainer, features, labels, net, decay_epoch=None,
+                          batch_size=10, log_interval=10, num_epochs=3):
dataset = gdata.ArrayDataset(features, labels)
data_iter = gdata.DataLoader(dataset, batch_size, shuffle=True)
loss = gloss.L2Loss()
@@ -308,25 +307,23 @@ def optimize_with_trainer(batch_size, trainer, num_epochs, decay_epoch,
```{.python .input}
net.initialize(init.Normal(sigma=0.01), force_reinit=True)
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.2})
-optimize_with_trainer(batch_size=1, trainer=trainer, num_epochs=3,
-                      decay_epoch=2, log_interval=10, features=features,
-                      labels=labels, net=net)
+optimize_with_trainer(trainer=trainer, features=features, labels=labels,
+                      net=net, decay_epoch=2, batch_size=1)
```
```{.python .input}
net.initialize(init.Normal(sigma=0.01), force_reinit=True)
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.999})
-optimize_with_trainer(batch_size=1000, trainer=trainer, num_epochs=3,
-                      decay_epoch=None, log_interval=1000, features=features,
-                      labels=labels, net=net)
+optimize_with_trainer(trainer=trainer, features=features, labels=labels,
+                      net=net, decay_epoch=None, batch_size=1000,
+                      log_interval=1000)
```
```{.python .input}
net.initialize(init.Normal(sigma=0.01), force_reinit=True)
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.2})
-optimize_with_trainer(batch_size=10, trainer=trainer, num_epochs=3,
-                      decay_epoch=2, log_interval=10, features=features,
-                      labels=labels, net=net)
+optimize_with_trainer(trainer=trainer, features=features, labels=labels,
+                      net=net, decay_epoch=2, batch_size=10)
```
The `get_data_ch7`, `optimize`, and `optimize_with_trainer` functions used in this section are defined in the `gluonbook` package for use by later chapters.
......
@@ -177,8 +177,7 @@ def sgd_momentum(params_vars, hyperparams, batch_size):
We first set the momentum hyperparameter $\gamma$ (`mom`) to 0.99. In this case, mini-batch stochastic gradient descent can be viewed as using a special gradient: a weighted average of $100\nabla f_\mathcal{B}(\boldsymbol{x})$ over the most recent 100 time steps. We observe that the loss rises after three epochs, most likely because of the large factor of 100 in this special gradient.
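The factor of 100 mentioned above comes from the geometric series behind the momentum update $\boldsymbol{v} \leftarrow \gamma \boldsymbol{v} + \eta \nabla f_\mathcal{B}(\boldsymbol{x})$: with a constant gradient, $\boldsymbol{v}$ approaches $\eta \nabla f_\mathcal{B}(\boldsymbol{x}) / (1 - \gamma)$, i.e. an effective step $1/(1-\gamma)$ times larger. A one-dimensional sketch (illustrative only, not the book's `sgd_momentum`) confirms the factors for the three values of `mom` tried in this section:

```python
# Velocity update v <- gamma*v + lr*g with a constant gradient g; the limit of
# v / (lr*g) is 1/(1 - gamma), the "factor" discussed above.
def effective_factor(gamma, steps=1000, lr=1.0, g=1.0):
    v = 0.0
    for _ in range(steps):
        v = gamma * v + lr * g
    return v / (lr * g)

print(round(effective_factor(0.99), 1))  # ~100.0
print(round(effective_factor(0.9), 1))   # ~10.0
print(round(effective_factor(0.5), 1))   # ~2.0
```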
```{.python .input n=10}
-gb.optimize(optimizer_fn=sgd_momentum, batch_size=10, num_epochs=3,
-            log_interval=10, params_vars=init_params_vars(),
+gb.optimize(optimizer_fn=sgd_momentum, params_vars=init_params_vars(),
hyperparams={'lr': 0.2, 'mom': 0.99}, features=features,
labels=labels, decay_epoch=2)
```
@@ -186,8 +185,7 @@ gb.optimize(optimizer_fn=sgd_momentum, batch_size=10, num_epochs=3,
Keeping the learning rate unchanged, to reduce the factor in the special gradient above, we set the momentum hyperparameter $\gamma$ (`mom`) to 0.9. The special gradient then becomes a weighted average of $10\nabla f_\mathcal{B}(\boldsymbol{x})$ over the most recent 10 time steps. We observe that the loss decreases after three epochs.
```{.python .input n=11}
-gb.optimize(optimizer_fn=sgd_momentum, batch_size=10, num_epochs=3,
-            log_interval=10, params_vars=init_params_vars(),
+gb.optimize(optimizer_fn=sgd_momentum, params_vars=init_params_vars(),
hyperparams={'lr': 0.2, 'mom': 0.9}, features=features,
labels=labels, decay_epoch=2)
```
@@ -195,8 +193,7 @@ gb.optimize(optimizer_fn=sgd_momentum, batch_size=10, num_epochs=3,
Still keeping the learning rate unchanged, we set the momentum hyperparameter $\gamma$ (`mom`) to 0.5. Mini-batch stochastic gradient descent can then be viewed as using a new special gradient: a weighted average of $2\nabla f_\mathcal{B}(\boldsymbol{x})$ over the most recent 2 time steps. We observe that the loss decreases after three epochs and the curve is fairly smooth. In the end, the model parameter values obtained through optimization are close to their true values.
```{.python .input n=12}
-gb.optimize(optimizer_fn=sgd_momentum, batch_size=10, num_epochs=3,
-            log_interval=10, params_vars=init_params_vars(),
+gb.optimize(optimizer_fn=sgd_momentum, params_vars=init_params_vars(),
hyperparams={'lr': 0.2, 'mom': 0.5}, features=features,
labels=labels, decay_epoch=2)
```
@@ -212,27 +209,24 @@ net.add(nn.Dense(1))
net.initialize(init.Normal(sigma=0.01), force_reinit=True)
trainer = gluon.Trainer(net.collect_params(), 'sgd',
{'learning_rate': 0.2, 'momentum': 0.99})
-gb.optimize_with_trainer(batch_size=10, trainer=trainer, num_epochs=3,
-                         decay_epoch=2, log_interval=10, features=features,
-                         labels=labels, net=net)
+gb.optimize_with_trainer(trainer=trainer, features=features, labels=labels,
+                         net=net, decay_epoch=2)
```
```{.python .input}
net.initialize(init.Normal(sigma=0.01), force_reinit=True)
trainer = gluon.Trainer(net.collect_params(), 'sgd',
{'learning_rate': 0.2, 'momentum': 0.9})
-gb.optimize_with_trainer(batch_size=10, trainer=trainer, num_epochs=3,
-                         decay_epoch=2, log_interval=10, features=features,
-                         labels=labels, net=net)
+gb.optimize_with_trainer(trainer=trainer, features=features, labels=labels,
+                         net=net, decay_epoch=2)
```
```{.python .input}
net.initialize(init.Normal(sigma=0.01), force_reinit=True)
trainer = gluon.Trainer(net.collect_params(), 'sgd',
{'learning_rate': 0.2, 'momentum': 0.5})
-gb.optimize_with_trainer(batch_size=10, trainer=trainer, num_epochs=3,
-                         decay_epoch=2, log_interval=10, features=features,
-                         labels=labels, net=net)
+gb.optimize_with_trainer(trainer=trainer, features=features, labels=labels,
+                         net=net, decay_epoch=2)
```
## Summary
......
@@ -76,8 +76,7 @@ def rmsprop(params_vars, hyperparams, batch_size):
We set the initial learning rate to 0.03 and $\gamma$ (`gamma`) to 0.9. The variable $\boldsymbol{s}$ can then be viewed as a weighted average of the squared term $\boldsymbol{g} \odot \boldsymbol{g}$ over the most recent $1/(1-0.9) = 10$ time steps. We observe that the loss oscillates somewhat in the later stage of iteration.
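The "$1/(1-0.9) = 10$ time steps" reading follows from the exponentially weighted moving average $\boldsymbol{s} \leftarrow \gamma \boldsymbol{s} + (1-\gamma)\,\boldsymbol{g} \odot \boldsymbol{g}$ that RMSProp maintains before dividing the gradient by $\sqrt{\boldsymbol{s} + \epsilon}$. Here is a scalar sketch of one such update (illustrative only; the names `rmsprop_step`, `s`, and `eps` are assumptions, not the book's code):

```python
# Scalar RMSProp step: keep an EWMA of squared gradients, then scale the step by 1/sqrt(s + eps).
def rmsprop_step(x, g, s, lr=0.03, gamma=0.9, eps=1e-6):
    s = gamma * s + (1 - gamma) * g * g     # EWMA with an effective window of ~1/(1 - gamma) steps
    x = x - lr * g / (s + eps) ** 0.5       # adaptively rescaled update
    return x, s

x, s = 1.0, 0.0
for _ in range(5):
    x, s = rmsprop_step(x, g=2 * x, s=s)    # gradient of f(x) = x**2 is 2*x
print(x, s)
```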
```{.python .input n=3}
-gb.optimize(optimizer_fn=rmsprop, batch_size=10, num_epochs=3,
-            log_interval=10, params_vars=init_params_vars(),
+gb.optimize(optimizer_fn=rmsprop, params_vars=init_params_vars(),
hyperparams={'lr': 0.03, 'gamma': 0.9}, features=features,
labels=labels)
```
@@ -85,8 +84,7 @@ gb.optimize(optimizer_fn=rmsprop, batch_size=10, num_epochs=3,
We now make $\gamma$ a bit larger, say 0.999. The variable $\boldsymbol{s}$ can then be viewed as a weighted average of the squared term $\boldsymbol{g} \odot \boldsymbol{g}$ over the most recent $1/(1-0.999) = 1000$ time steps. The loss is then smoother in the later stage of iteration.
```{.python .input}
-gb.optimize(optimizer_fn=rmsprop, batch_size=10, num_epochs=3,
-            log_interval=10, params_vars=init_params_vars(),
+gb.optimize(optimizer_fn=rmsprop, params_vars=init_params_vars(),
hyperparams={'lr': 0.03, 'gamma': 0.999}, features=features,
labels=labels)
```
@@ -102,18 +100,16 @@ net.add(nn.Dense(1))
net.initialize(init.Normal(sigma=0.01), force_reinit=True)
trainer = gluon.Trainer(net.collect_params(), 'rmsprop',
{'learning_rate': 0.03, 'gamma1': 0.9})
-gb.optimize_with_trainer(batch_size=10, trainer=trainer, num_epochs=3,
-                         decay_epoch=None, log_interval=10, features=features,
-                         labels=labels, net=net)
+gb.optimize_with_trainer(trainer=trainer, features=features, labels=labels,
+                         net=net)
```
```{.python .input}
net.initialize(init.Normal(sigma=0.01), force_reinit=True)
trainer = gluon.Trainer(net.collect_params(), 'rmsprop',
{'learning_rate': 0.03, 'gamma1': 0.999})
-gb.optimize_with_trainer(batch_size=10, trainer=trainer, num_epochs=3,
-                         decay_epoch=None, log_interval=10, features=features,
-                         labels=labels, net=net)
+gb.optimize_with_trainer(trainer=trainer, features=features, labels=labels,
+                         net=net)
```
## Summary
......
@@ -254,8 +254,8 @@ def _make_list(obj, default_values=None):
return obj
-def optimize(optimizer_fn, batch_size, num_epochs, log_interval,
-             params_vars, hyperparams, features, labels, decay_epoch=None,
+def optimize(optimizer_fn, params_vars, hyperparams, features, labels,
+             decay_epoch=None, batch_size=10, log_interval=10, num_epochs=3,
is_adam=False):
"""Optimize an objective function."""
dataset = gdata.ArrayDataset(features, labels)
@@ -286,8 +286,8 @@ def optimize(optimizer_fn, batch_size, num_epochs, log_interval,
semilogy(es, ls, 'epoch', 'loss')
-def optimize_with_trainer(batch_size, trainer, num_epochs, decay_epoch,
-                          log_interval, features, labels, net):
+def optimize_with_trainer(trainer, features, labels, net, decay_epoch=None,
+                          batch_size=10, log_interval=10, num_epochs=3):
"""Optimize an objective function with a Gluon trainer."""
dataset = gdata.ArrayDataset(features, labels)
data_iter = gdata.DataLoader(dataset, batch_size, shuffle=True)
......