Commit 8450429f, authored by Aston Zhang

rnn code

Parent c5a424b9
@@ -151,13 +151,13 @@ def gru_rnn(inputs, H, *params):
 ```{.python .input n=5}
 get_inputs = gb.to_onehot
-num_epochs = 350
+num_epochs = 150
 num_steps = 35
 batch_size = 32
-lr = 0.7
+lr = 0.25
 clipping_theta = 5
 prefixes = ['分开', '不分开']
-pred_period = 70
+pred_period = 30
 pred_len = 100
 gb.train_and_predict_rnn(gru_rnn, False, num_epochs, num_steps, num_hiddens,
......
@@ -96,7 +96,12 @@ vocab_size = len(char_to_idx)
 The following section initializes the model parameters. The parameter `hidden_dim` defines the length of the hidden state.
 ```{.python .input n=3}
-ctx = gb.try_gpu()
+#ctx = gb.try_gpu()
+import mxnet as mx
+ctx = mx.gpu(3)
 input_dim = vocab_size
 num_hiddens = 256
 output_dim = vocab_size
@@ -170,13 +175,13 @@ def lstm_rnn(inputs, state_h, state_c, *params):
 ```{.python .input n=5}
 get_inputs = gb.to_onehot
-num_epochs = 450
+num_epochs = 150
 num_steps = 35
 batch_size = 32
-lr = 0.6
+lr = 0.25
 clipping_theta = 5
 prefixes = ['分开', '不分开']
-pred_period = 90
+pred_period = 30
 pred_len = 100
 gb.train_and_predict_rnn(lstm_rnn, False, num_epochs, num_steps, num_hiddens,
......
@@ -345,74 +345,66 @@ $$\text{loss} = -\frac{1}{N} \sum_{i=1}^N \log p_{\text{target}_i}$$
 The perplexity of any valid model must be smaller than the number of elements in the prediction set. In this example, the perplexity must be smaller than the number of characters in the dictionary, $|W|$. A model that attains a lower perplexity (closer to 1) usually makes more accurate predictions. (A small numerical illustration of perplexity follows the code below.)
 ```{.python .input n=17}
-from math import exp
-def train_and_predict_rnn(rnn, is_random_iter, epochs, num_steps, hidden_dim,
-                          learning_rate, clipping_theta, batch_size,
-                          pred_period, pred_len, seqs, get_params, get_inputs,
-                          ctx, corpus_indices, idx_to_char, char_to_idx,
-                          is_lstm=False):
+def train_and_predict_rnn(rnn, is_random_iter, num_epochs, num_steps,
+                          num_hiddens, lr, clipping_theta, batch_size,
+                          vocab_size, pred_period, pred_len, prefixes,
+                          get_params, get_inputs, ctx, corpus_indices,
+                          idx_to_char, char_to_idx, is_lstm=False):
     if is_random_iter:
         data_iter = data_iter_random
     else:
         data_iter = data_iter_consecutive
     params = get_params()
-    softmax_cross_entropy = gloss.SoftmaxCrossEntropyLoss()
+    loss = gloss.SoftmaxCrossEntropyLoss()
-    for e in range(1, epochs + 1):
-        # With consecutive batch sampling, the hidden variable only needs to be initialized at the start of each epoch.
+    for epoch in range(1, num_epochs + 1):
+        # With consecutive batch sampling, the hidden state variable only needs to be initialized at the beginning of each epoch.
         if not is_random_iter:
-            state_h = nd.zeros(shape=(batch_size, hidden_dim), ctx=ctx)
+            state_h = nd.zeros(shape=(batch_size, num_hiddens), ctx=ctx)
             if is_lstm:
                 # Only used when the RNN is an LSTM; can be ignored here.
-                state_c = nd.zeros(shape=(batch_size, hidden_dim), ctx=ctx)
-        train_loss, num_examples = 0, 0
-        for data, label in data_iter(corpus_indices, batch_size, num_steps,
-                                     ctx):
-            # With random batch sampling, the hidden variable needs to be initialized before each random minibatch.
+                state_c = nd.zeros(shape=(batch_size, num_hiddens), ctx=ctx)
+        train_l_sum = nd.array([0], ctx=ctx)
+        train_l_cnt = 0
+        for X, Y in data_iter(corpus_indices, batch_size, num_steps, ctx):
+            # With random batch sampling, the hidden state variable needs to be initialized before each random minibatch.
             if is_random_iter:
-                state_h = nd.zeros(shape=(batch_size, hidden_dim), ctx=ctx)
+                state_h = nd.zeros(shape=(batch_size, num_hiddens), ctx=ctx)
                 if is_lstm:
                     # Only used when the RNN is an LSTM; can be ignored here.
-                    state_c = nd.zeros(shape=(batch_size, hidden_dim), ctx=ctx)
+                    state_c = nd.zeros(shape=(batch_size, num_hiddens),
+                                       ctx=ctx)
+            # With consecutive batch sampling, the hidden state variables need to be detached from the computation graph.
             else:
                 state_h = state_h.detach()
                 if is_lstm:
                     state_c = state_c.detach()
             with autograd.record():
-                # outputs size: (batch_size, vocab_size)
+                # outputs shape: (batch_size, vocab_size)
                 if is_lstm:
                     # Only used when the RNN is an LSTM; can be ignored here.
-                    outputs, state_h, state_c = rnn(get_inputs(data, vocab_size), state_h,
-                                                    state_c, *params)
+                    outputs, state_h, state_c = rnn(
+                        get_inputs(X, vocab_size), state_h, state_c, *params)
                 else:
-                    outputs, state_h = rnn(get_inputs(data, vocab_size), state_h, *params)
-                # Let t_ib_j be element j of the minibatch at time step i:
-                # label size: (batch_size * num_steps)
-                # label = [t_0b_0, t_0b_1, ..., t_1b_0, t_1b_1, ..., ]
-                label = label.T.reshape((-1,))
-                # Concatenate outputs, size: (batch_size * num_steps, vocab_size).
+                    outputs, state_h = rnn(
+                        get_inputs(X, vocab_size), state_h, *params)
+                # Let t_ib_j be element j of the minibatch at time step i:
+                # Y shape: (batch_size * num_steps)
+                # Y = [t_0b_0, t_0b_1, ..., t_1b_0, t_1b_1, ..., ]
+                y = Y.T.reshape((-1,))
+                # Concatenate outputs, shape: (batch_size * num_steps, vocab_size).
                 outputs = nd.concat(*outputs, dim=0)
-                # After the operations above, outputs and label are aligned.
-                loss = softmax_cross_entropy(outputs, label)
-            loss.backward()
+                # loss(outputs, y) shape: (batch_size * num_steps,)
+                l = loss(outputs, y)
+            l.backward()
             grad_clipping(params, clipping_theta, ctx)
-            gb.sgd(params, learning_rate, 1)
-            train_loss += nd.sum(loss).asscalar()
-            num_examples += loss.size
-        if e % pred_period == 0:
-            print("Epoch %d. Perplexity %f" % (e,
-                  exp(train_loss/num_examples)))
-            for seq in seqs:
-                print(' - ', predict_rnn(rnn, seq, pred_len, params,
-                      hidden_dim, vocab_size, ctx, idx_to_char, char_to_idx, get_inputs,
-                      is_lstm))
-            print()
+            gb.sgd(params, lr, 1)
+            train_l_sum = train_l_sum + l.sum()
+            train_l_cnt += l.size
+        if epoch % pred_period == 0:
+            print("\nepoch %d, perplexity %f"
+                  % (epoch, (train_l_sum / train_l_cnt).exp().asscalar()))
+            for prefix in prefixes:
+                print(' - ', predict_rnn(
+                    rnn, prefix, pred_len, params, num_hiddens, vocab_size,
+                    ctx, idx_to_char, char_to_idx, get_inputs, is_lstm))
 ```
 We define the functions `to_onehot`, `data_iter_random`, `data_iter_consecutive`, `grad_clipping`, `predict_rnn`, and `train_and_predict_rnn` in the gluonbook package so that later chapters can call them.
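To make the perplexity metric discussed above concrete, here is a small numerical sketch. It is an editorial illustration with made-up probabilities, not part of the book's code or of this commit; it only demonstrates that perplexity is the exponential of the average negative log-likelihood assigned to the target characters, which is exactly what `train_and_predict_rnn` reports by accumulating the summed cross-entropy loss in `train_l_sum`, dividing by the number of predicted characters `train_l_cnt`, and exponentiating.

```python
from math import exp, log

# Made-up example: 3 target characters from a 5-character vocabulary.
# Each entry is the probability the model assigned to the correct next character.
probs_of_targets = [0.2, 0.2, 0.2]  # uniform guessing over |W| = 5 characters
avg_neg_log_lik = -sum(log(p) for p in probs_of_targets) / len(probs_of_targets)
print(exp(avg_neg_log_lik))  # 5.0: the perplexity of uniform guessing equals |W|

probs_of_targets = [0.9, 0.6, 0.8]  # a model that usually ranks the target highly
avg_neg_log_lik = -sum(log(p) for p in probs_of_targets) / len(probs_of_targets)
print(exp(avg_neg_log_lik))  # about 1.32, i.e. much closer to the ideal value of 1
```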
@@ -420,38 +412,32 @@ def train_and_predict_rnn(rnn, is_random_iter, epochs, num_steps, hidden_dim,
 The following defines the model hyperparameters and the prefixes of the sequences to be predicted.
 ```{.python .input}
-epochs = 200
+num_epochs = 200
 num_steps = 35
 batch_size = 32
-seq1 = '分开'
-seq2 = '不分开'
-seq3 = '战争中部队'
-seqs = [seq1, seq2, seq3]
+lr = 0.25
+clipping_theta = 5
+prefixes = ['分开', '不分开']
+pred_period = 40
+pred_len = 100
 ```
 We first use random batch sampling to experiment with writing song lyrics with the recurrent neural network. We assume the lyric prefixes are '分开', '不分开', and '战争中部队'.
 ```{.python .input n=18}
-train_and_predict_rnn(rnn=rnn, is_random_iter=False, epochs=200, num_steps=35,
-                      hidden_dim=num_hiddens, learning_rate=0.2,
-                      clipping_theta=5, batch_size=32, pred_period=40,
-                      pred_len=100, seqs=seqs, get_params=get_params,
-                      get_inputs=get_inputs, ctx=ctx,
-                      corpus_indices=corpus_indices, idx_to_char=idx_to_char,
-                      char_to_idx=char_to_idx)
+train_and_predict_rnn(rnn, True, num_epochs, num_steps, num_hiddens, lr,
+                      clipping_theta, batch_size, vocab_size, pred_period,
+                      pred_len, prefixes, get_params, get_inputs, ctx,
+                      corpus_indices, idx_to_char, char_to_idx)
 ```
 We then use consecutive batch sampling to experiment with writing song lyrics with the recurrent neural network.
 ```{.python .input n=19}
-train_and_predict_rnn(rnn=rnn, is_random_iter=False, epochs=200, num_steps=35,
-                      hidden_dim=num_hiddens, learning_rate=0.2,
-                      clipping_theta=5, batch_size=32, pred_period=40,
-                      pred_len=100, seqs=seqs, get_params=get_params,
-                      get_inputs=get_inputs, ctx=ctx,
-                      corpus_indices=corpus_indices, idx_to_char=idx_to_char,
-                      char_to_idx=char_to_idx)
+train_and_predict_rnn(rnn, False, num_epochs, num_steps, num_hiddens, lr,
+                      clipping_theta, batch_size, vocab_size, pred_period,
+                      pred_len, prefixes, get_params, get_inputs, ctx,
+                      corpus_indices, idx_to_char, char_to_idx)
 ```
 We can see that the model first learns simple characters, then simple words, then somewhat more complex words, and eventually the output starts to look like sentences.
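To make the difference between the two sampling schemes more tangible, here is a simplified, self-contained sketch. The function names `consecutive_batches` and `random_batches` are hypothetical and the code is an editorial illustration, not the gluonbook `data_iter_consecutive`/`data_iter_random` (which also return the shifted labels and place the data on a context). Under consecutive sampling, minibatch i+1 continues exactly where minibatch i stopped in each of the `batch_size` streams, which is why the hidden state can be carried across minibatches after being detached from the computation graph; under random sampling, each minibatch starts at unrelated positions, so the hidden state has to be reinitialized for every minibatch.

```python
import random

def consecutive_batches(corpus, batch_size, num_steps):
    # Split the corpus into batch_size parallel streams; successive minibatches
    # read adjacent positions of each stream (labels omitted for brevity).
    stream_len = len(corpus) // batch_size
    streams = [corpus[i * stream_len:(i + 1) * stream_len]
               for i in range(batch_size)]
    for start in range(0, stream_len - num_steps + 1, num_steps):
        yield [s[start:start + num_steps] for s in streams]

def random_batches(corpus, batch_size, num_steps):
    # Each minibatch is drawn from random, unrelated positions of the corpus,
    # so a hidden state carried over from the previous minibatch is meaningless.
    starts = list(range(len(corpus) - num_steps))
    random.shuffle(starts)
    for i in range(0, len(starts) - batch_size + 1, batch_size):
        yield [corpus[s:s + num_steps] for s in starts[i:i + batch_size]]

corpus = list(range(30))  # a toy "corpus" of 30 token indices
print(list(consecutive_batches(corpus, batch_size=2, num_steps=5)))
print(next(random_batches(corpus, batch_size=2, num_steps=5)))
```

In the book's code each example also comes with a label sequence shifted one character to the right; that part is left out here to keep the contrast between the two iteration orders in focus.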
......
@@ -344,7 +344,7 @@ def train_and_predict_rnn(rnn, is_random_iter, num_epochs, num_steps,
             if is_lstm:
                 state_c = nd.zeros(shape=(batch_size, num_hiddens), ctx=ctx)
         train_l_sum = nd.array([0], ctx=ctx)
-        num_iters = 0
+        train_l_cnt = 0
         for X, Y in data_iter(corpus_indices, batch_size, num_steps, ctx):
             if is_random_iter:
                 state_h = nd.zeros(shape=(batch_size, num_hiddens), ctx=ctx)
@@ -364,15 +364,15 @@ def train_and_predict_rnn(rnn, is_random_iter, num_epochs, num_steps,
                         get_inputs(X, vocab_size), state_h, *params)
                 y = Y.T.reshape((-1,))
                 outputs = nd.concat(*outputs, dim=0)
-                l = loss(outputs, y).sum() / (batch_size * num_steps)
+                l = loss(outputs, y)
             l.backward()
             grad_clipping(params, clipping_theta, ctx)
             sgd(params, lr, 1)
-            train_l_sum = train_l_sum + l
-            num_iters += 1
+            train_l_sum = train_l_sum + l.sum()
+            train_l_cnt += l.size
         if epoch % pred_period == 0:
             print("\nepoch %d, perplexity %f"
-                  % (epoch, (train_l_sum / num_iters).exp().asscalar()))
+                  % (epoch, (train_l_sum / train_l_cnt).exp().asscalar()))
             for prefix in prefixes:
                 print(' - ', predict_rnn(
                     rnn, prefix, pred_len, params, num_hiddens, vocab_size,
......