diff --git a/chapter02_supervised-learning/02.md b/chapter02_supervised-learning/02.md
index 50dfb2edae055048e5b6df4a3f1012d0dc13ab89..740677cd060f59bbdfaadc9c213f6877455c0413 100644
--- a/chapter02_supervised-learning/02.md
+++ b/chapter02_supervised-learning/02.md
@@ -14,6 +14,8 @@
    underfit-overfit
    reg-scratch
    reg-gluon
+   dropout-scratch
+   dropout-gluon
    kaggle-gluon-kfold
-
+
 ```
diff --git a/chapter02_supervised-learning/dropout-gluon.md b/chapter02_supervised-learning/dropout-gluon.md
index bc08022dca867da71b4e83233892a1239e0b52b7..12606e6b28356a888285626c39bc6e8434ab9664 100644
--- a/chapter02_supervised-learning/dropout-gluon.md
+++ b/chapter02_supervised-learning/dropout-gluon.md
@@ -9,23 +9,23 @@
 setting a smaller dropout probability for elements closer to the input layer. In this experiment, we set the dropout probability of the elements after the first fully connected layer to 0.2, and of the elements after the second fully connected layer to 0.5.
 
 ```{.python .input n=5}
-from mxnet import gluon
+from mxnet.gluon import nn
 
-net = gluon.nn.Sequential()
+net = nn.Sequential()
 drop_prob1 = 0.2
 drop_prob2 = 0.5
 
 with net.name_scope():
-    net.add(gluon.nn.Flatten())
+    net.add(nn.Flatten())
     # First fully connected layer.
-    net.add(gluon.nn.Dense(256, activation="relu"))
+    net.add(nn.Dense(256, activation="relu"))
     # Add a dropout layer after the first fully connected layer.
-    net.add(gluon.nn.Dropout(drop_prob1))
+    net.add(nn.Dropout(drop_prob1))
     # Second fully connected layer.
-    net.add(gluon.nn.Dense(256, activation="relu"))
+    net.add(nn.Dense(256, activation="relu"))
     # Add a dropout layer after the second fully connected layer.
-    net.add(gluon.nn.Dropout(drop_prob2))
-    net.add(gluon.nn.Dense(10))
+    net.add(nn.Dropout(drop_prob2))
+    net.add(nn.Dense(10))
 net.initialize()
 ```
 
@@ -36,16 +36,17 @@ net.initialize()
 ```{.python .input n=6}
 import sys
 sys.path.append('..')
-from mxnet import ndarray as nd
-from mxnet import autograd
 import utils
-
+from mxnet import nd
+from mxnet import autograd
+from mxnet import gluon
 
 batch_size = 256
 train_data, test_data = utils.load_data_fashion_mnist(batch_size)
 
 softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()
-trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.5})
+trainer = gluon.Trainer(net.collect_params(),
+                        'sgd', {'learning_rate': 0.5})
 
 for epoch in range(5):
     train_loss = 0.
@@ -62,7 +63,8 @@ for epoch in range(5):
 
     test_acc = utils.evaluate_accuracy(test_data, net)
     print("Epoch %d. Loss: %f, Train acc %f, Test acc %f" % (
-        epoch, train_loss/len(train_data), train_acc/len(train_data), test_acc))
+        epoch, train_loss/len(train_data),
+        train_acc/len(train_data), test_acc))
 ```
 
 ## Conclusion
diff --git a/chapter02_supervised-learning/dropout-scratch.md b/chapter02_supervised-learning/dropout-scratch.md
index 60661ed34bde0fde09f2e061418ae5e0a19d1782..6d2723791aa855a322925b5efa46bc2d46b35058 100644
--- a/chapter02_supervised-learning/dropout-scratch.md
+++ b/chapter02_supervised-learning/dropout-scratch.md
@@ -1,7 +1,6 @@
 # Dropout --- from scratch
 
-Earlier we introduced multilayer neural networks, that is, networks with at least one hidden layer. We also introduced regularization as a way to deal with overfitting. In deep learning, a commonly used way to deal with overfitting is called dropout. This section uses a multilayer
-neural network as the example and introduces dropout from scratch.
+Earlier we introduced multilayer neural networks, that is, networks with at least one hidden layer. We also introduced regularization as a way to deal with overfitting. In deep learning, a commonly used way to deal with overfitting is called dropout. This section uses a multilayer neural network as the example and introduces dropout from scratch.
 
 Since the concept and implementation of dropout are very simple, in this section we first introduce the concept of dropout and how it is implemented in modern neural networks. Then we will look into the essence of dropout.
 
@@ -20,6 +19,8 @@
 Dropout is easy to implement, for example like below. The scalar `drop_probability` here defines the probability that any given element of `X` (an `NDArray`) is dropped.
 
 ```{.python .input}
+from mxnet import nd
+
 def dropout(X, drop_probability):
     keep_probability = 1 - drop_probability
     assert 0 <= keep_probability <= 1
@@ -28,8 +29,9 @@ def dropout(X, drop_probability):
         return X.zeros_like()
 
     # Randomly pick a subset of this layer's outputs to drop.
-    mask = nd.random_uniform(0, 1.0, X.shape, ctx=X.context) < keep_probability
-    # keep_probability is guaranteed to be nonzero here.
+    mask = nd.random.uniform(
+        0, 1.0, X.shape, ctx=X.context) < keep_probability
+    # Ensure E[dropout(X)] == X
     scale = 1 / keep_probability
     return mask * X * scale
 ```
@@ -37,8 +39,6 @@ def dropout(X, drop_probability):
 Let's run a few examples to verify it.
 
 ```{.python .input}
-from mxnet import ndarray as nd
-
 A = nd.arange(20).reshape((5,4))
 dropout(A, 0.0)
 ```
@@ -109,12 +109,6 @@
 params = [W1, b1, W2, b2, W3, b3]
 for param in params:
     param.attach_grad()
-
-def relu(X):
-    return nd.maximum(X, 0)
-
-from mxnet import gluon
-softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()
 ```
 
 ## Define a model with dropout layers
@@ -129,15 +123,14 @@ drop_prob2 = 0.5
 def net(X):
     X = X.reshape((-1, num_inputs))
     # First fully connected layer.
-    h1 = relu(nd.dot(X, W1) + b1)
+    h1 = nd.relu(nd.dot(X, W1) + b1)
     # Add a dropout layer after the first fully connected layer.
     h1 = dropout(h1, drop_prob1)
     # Second fully connected layer.
-    h2 = relu(nd.dot(h1, W2) + b2)
+    h2 = nd.relu(nd.dot(h1, W2) + b2)
     # Add a dropout layer after the second fully connected layer.
     h2 = dropout(h2, drop_prob2)
-    output = nd.dot(h2, W3) + b3
-    return output
+    return nd.dot(h2, W3) + b3
 ```
 
 ## Training
@@ -145,7 +138,10 @@ def net(X):
 Training is the same as before.
 
 ```{.python .input n=8}
-from mxnet import autograd as autograd
+from mxnet import autograd
+from mxnet import gluon
+
+softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()
 
 learning_rate = .5
 
@@ -164,7 +160,8 @@ for epoch in range(5):
 
     test_acc = utils.evaluate_accuracy(test_data, net)
     print("Epoch %d. Loss: %f, Train acc %f, Test acc %f" % (
-        epoch, train_loss/len(train_data), train_acc/len(train_data), test_acc))
+        epoch, train_loss/len(train_data),
+        train_acc/len(train_data), test_acc))
 ```
 
 ## Summary
diff --git a/chapter02_supervised-learning/mlp-scratch.md b/chapter02_supervised-learning/mlp-scratch.md
index 97cf8a1ec671f27ed89b8762ee6971994f7bdecc..e248b607b36bbcc78525e2e72ce93d2e1d313a0b 100644
--- a/chapter02_supervised-learning/mlp-scratch.md
+++ b/chapter02_supervised-learning/mlp-scratch.md
@@ -103,7 +103,8 @@ for epoch in range(5):
 
     test_acc = utils.evaluate_accuracy(test_data, net)
     print("Epoch %d. Loss: %f, Train acc %f, Test acc %f" % (
-        epoch, train_loss/len(train_data), train_acc/len(train_data), test_acc))
+        epoch, train_loss/len(train_data),
+        train_acc/len(train_data), test_acc))
 ```
 
 ## Summary
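
A note on the comment this patch rewrites in `dropout-scratch.md` (`# Ensure E[dropout(X)] == X`): the `scale = 1 / keep_probability` factor is what keeps the expected value of the layer output unchanged under dropout. The sketch below checks that property empirically; it assumes MXNet 1.x is installed, the helper simply mirrors the patched `dropout` for the non-degenerate case, and the array shape and probability are arbitrary illustration values, not part of the patch.

```python
from mxnet import nd

def dropout(X, drop_probability):
    # Mirror of the patched helper for 0 < keep_probability <= 1:
    # drop each element with probability drop_probability, then rescale
    # the survivors by 1/keep_probability so that E[dropout(X)] == X.
    keep_probability = 1 - drop_probability
    assert 0 < keep_probability <= 1
    mask = nd.random.uniform(
        0, 1.0, X.shape, ctx=X.context) < keep_probability
    return mask * X * (1 / keep_probability)

X = nd.ones((1000, 1000))
# Roughly half the elements are zeroed and the rest are doubled, so the
# empirical mean should stay close to the original mean of 1.0.
print(dropout(X, 0.5).mean())
```

With an array this large the printed mean should land very close to 1.0, which is exactly the invariant the renamed comment documents.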
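
On the `dropout-gluon.md` side, the training loop needs no explicit train/test switch because Gluon's `nn.Dropout` is only active in training mode, i.e. inside `autograd.record()`, and acts as the identity in inference mode; that is standard MXNet Gluon behaviour the patch relies on, not something it introduces. A minimal sketch of that behaviour, using a toy layer and input chosen only for illustration:

```python
from mxnet import autograd, nd
from mxnet.gluon import nn

net = nn.Sequential()
with net.name_scope():
    net.add(nn.Dropout(0.5))
net.initialize()

x = nd.ones((2, 8))

# Outside autograd.record() Gluon runs in inference mode,
# so the dropout layer passes the input through unchanged.
print(net(x))

# Inside autograd.record() the layer is in training mode: roughly half
# of the elements are zeroed and the survivors are scaled by 2.
with autograd.record():
    print(net(x))
```

This is also why `utils.evaluate_accuracy(test_data, net)` in the patched loops already evaluates the deterministic network without any extra flag.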