diff --git a/chapter_recurrent-neural-networks/gru.md b/chapter_recurrent-neural-networks/gru.md
index 355adea3ede8449a76f17ae8ca634babe1af69e6..6baa0f5b51468790d36a5b4938c011bd7b571684 100644
--- a/chapter_recurrent-neural-networks/gru.md
+++ b/chapter_recurrent-neural-networks/gru.md
@@ -91,13 +91,13 @@ def get_params():
                 _one((num_hiddens, num_hiddens)),
                 nd.zeros(num_hiddens, ctx=ctx))
 
-    W_xz, W_hz, b_z = _three()  # 更新门参数。
-    W_xr, W_hr, b_r = _three()  # 重置门参数。
-    W_xh, W_hh, b_h = _three()  # 候选隐藏状态参数。
-    # 输出层参数。
+    W_xz, W_hz, b_z = _three()  # 更新门参数
+    W_xr, W_hr, b_r = _three()  # 重置门参数
+    W_xh, W_hh, b_h = _three()  # 候选隐藏状态参数
+    # 输出层参数
     W_hq = _one((num_hiddens, num_outputs))
     b_q = nd.zeros(num_outputs, ctx=ctx)
-    # 创建梯度。
+    # 附上梯度
     params = [W_xz, W_hz, b_z, W_xr, W_hr, b_r, W_xh, W_hh, b_h, W_hq, b_q]
     for param in params:
         param.attach_grad()
diff --git a/chapter_recurrent-neural-networks/lang-model-dataset.md b/chapter_recurrent-neural-networks/lang-model-dataset.md
index 57baa1a304bbcc9d8e5bc1ae65ef9333a8e578ae..22a8d89570b3dd7948115da82a488a017392d8e6 100644
--- a/chapter_recurrent-neural-networks/lang-model-dataset.md
+++ b/chapter_recurrent-neural-networks/lang-model-dataset.md
@@ -56,20 +56,20 @@ print('indices:', sample)
 在随机采样中，每个样本是原始序列上任意截取的一段序列。相邻的两个随机小批量在原始序列上的位置不一定相毗邻。因此，我们无法用一个小批量最终时间步的隐藏状态来初始化下一个小批量的隐藏状态。在训练模型时，每次随机采样前都需要重新初始化隐藏状态。
 
 ```{.python .input  n=25}
-# 本函数已保存在 d2lzh 包中方便以后使用。
+# 本函数已保存在d2lzh包中方便以后使用
 def data_iter_random(corpus_indices, batch_size, num_steps, ctx=None):
-    # 减一是因为输出的索引是相应输入的索引加一。
+    # 减1是因为输出的索引是相应输入的索引加1
     num_examples = (len(corpus_indices) - 1) // num_steps
     epoch_size = num_examples // batch_size
     example_indices = list(range(num_examples))
     random.shuffle(example_indices)
 
-    # 返回从 pos 开始的长为 num_steps 的序列。
+    # 返回从pos开始的长为num_steps的序列
     def _data(pos):
         return corpus_indices[pos: pos + num_steps]
 
     for i in range(epoch_size):
-        # 每次读取 batch_size 个随机样本。
+        # 每次读取batch_size个随机样本
         i = i * batch_size
         batch_indices = example_indices[i: i + batch_size]
         X = [_data(j * num_steps) for j in batch_indices]
@@ -93,7 +93,7 @@ for X, Y in data_iter_random(my_seq, batch_size=2, num_steps=6):
 为了使模型参数的梯度计算只依赖一次迭代读取的小批量序列，我们可以在每次读取小批量前将隐藏状态从计算图分离出来。我们将在后面几节的实现中了解这个处理方式。
 
 ```{.python .input  n=32}
-# 本函数已保存在 d2lzh 包中方便以后使用。
+# 本函数已保存在d2lzh包中方便以后使用
 def data_iter_consecutive(corpus_indices, batch_size, num_steps, ctx=None):
     corpus_indices = nd.array(corpus_indices, ctx=ctx)
     data_len = len(corpus_indices)
diff --git a/chapter_recurrent-neural-networks/lstm.md b/chapter_recurrent-neural-networks/lstm.md
index 571e0f01f5d25dd0b0259d2e5822c362ba8f3fd6..8410a3c7aeb92cad380728e4a6ecdbb156f3dbce 100644
--- a/chapter_recurrent-neural-networks/lstm.md
+++ b/chapter_recurrent-neural-networks/lstm.md
@@ -100,14 +100,14 @@ def get_params():
                 _one((num_hiddens, num_hiddens)),
                 nd.zeros(num_hiddens, ctx=ctx))
 
-    W_xi, W_hi, b_i = _three()  # 输入门参数。
-    W_xf, W_hf, b_f = _three()  # 遗忘门参数。
-    W_xo, W_ho, b_o = _three()  # 输出门参数。
-    W_xc, W_hc, b_c = _three()  # 候选细胞参数。
-    # 输出层参数。
+    W_xi, W_hi, b_i = _three()  # 输入门参数
+    W_xf, W_hf, b_f = _three()  # 遗忘门参数
+    W_xo, W_ho, b_o = _three()  # 输出门参数
+    W_xc, W_hc, b_c = _three()  # 候选记忆细胞参数
+    # 输出层参数
     W_hq = _one((num_hiddens, num_outputs))
     b_q = nd.zeros(num_outputs, ctx=ctx)
-    # 创建梯度。
+    # 附上梯度
     params = [W_xi, W_hi, b_i, W_xf, W_hf, b_f, W_xo, W_ho, b_o, W_xc, W_hc,
               b_c, W_hq, b_q]
     for param in params:
diff --git a/chapter_recurrent-neural-networks/rnn-gluon.md b/chapter_recurrent-neural-networks/rnn-gluon.md
index 1963cd3bbffd8fe6bb150ad5b59bcc31744369f6..ac54499e7437ec8c21728f773f1a9f4ac7319baf 100644
--- a/chapter_recurrent-neural-networks/rnn-gluon.md
+++ b/chapter_recurrent-neural-networks/rnn-gluon.md
@@ -43,7 +43,7 @@ Y.shape, len(state_new), state_new[0].shape
 接下来我们继承Block类来定义一个完整的循环神经网络。它首先将输入数据使用one-hot向量表示后输入到`rnn_layer`中，然后使用全连接输出层得到输出。输出个数等于词典大小`vocab_size`。
 
 ```{.python .input  n=39}
-# 本类已保存在 d2lzh 包中方便以后使用。
+# 本类已保存在d2lzh包中方便以后使用
 class RNNModel(nn.Block):
     def __init__(self, rnn_layer, vocab_size, **kwargs):
         super(RNNModel, self).__init__(**kwargs)
@@ -52,11 +52,11 @@ class RNNModel(nn.Block):
         self.dense = nn.Dense(vocab_size)
 
     def forward(self, inputs, state):
-        # 将输入转置成（num_steps，batch_size）后获取 one-hot 向量表示。
+        # 将输入转置成(num_steps,batch_size)后获取one-hot向量表示
         X = nd.one_hot(inputs.T, self.vocab_size)
         Y, state = self.rnn(X, state)
-        # 全连接层会首先将 Y 的形状变成（num_steps * batch_size，num_hiddens），
-        # 它的输出形状为（num_steps * batch_size，vocab_size）。
+        # 全连接层会首先将Y的形状变成(num_steps * batch_size,num_hiddens)，它的输出
+        # 形状为(num_steps * batch_size,vocab_size)
         output = self.dense(Y.reshape((-1, Y.shape[-1])))
         return output, state
 
@@ -69,15 +69,15 @@ class RNNModel(nn.Block):
 同前一节一样，以下定义了一个预测函数。这里的实现区别在于前向计算和初始化隐藏状态的函数接口。
 
 ```{.python .input  n=41}
-# 本函数已保存在 d2lzh 包中方便以后使用。
+# 本函数已保存在d2lzh包中方便以后使用
 def predict_rnn_gluon(prefix, num_chars, model, vocab_size, ctx, idx_to_char,
                       char_to_idx):
-    # 使用 model 的成员函数来初始化隐藏状态。
+    # 使用model的成员函数来初始化隐藏状态
     state = model.begin_state(batch_size=1, ctx=ctx)
     output = [char_to_idx[prefix[0]]]
     for t in range(num_chars + len(prefix) - 1):
         X = nd.array([output[-1]], ctx=ctx).reshape((1, 1))
-        (Y, state) = model(X, state)  # 前向计算不需要传入模型参数。
+        (Y, state) = model(X, state)  # 前向计算不需要传入模型参数
         if t < len(prefix) - 1:
             output.append(char_to_idx[prefix[t + 1]])
         else:
@@ -97,7 +97,7 @@ predict_rnn_gluon('分开', 10, model, vocab_size, ctx, idx_to_char, char_to_idx
 接下来实现训练函数。它的算法同上一节一样，但这里只使用了相邻采样来读取数据。
 
 ```{.python .input  n=18}
-# 本函数已保存在 d2lzh 包中方便以后使用。
+# 本函数已保存在d2lzh包中方便以后使用
 def train_and_predict_rnn_gluon(model, num_hiddens, vocab_size, ctx,
                                 corpus_indices, idx_to_char, char_to_idx,
                                 num_epochs, num_steps, lr, clipping_theta,
@@ -120,10 +120,10 @@ def train_and_predict_rnn_gluon(model, num_hiddens, vocab_size, ctx,
                 y = Y.T.reshape((-1,))
                 l = loss(output, y).mean()
             l.backward()
-            # 梯度裁剪。
+            # 梯度裁剪
             params = [p.data() for p in model.collect_params().values()]
             d2l.grad_clipping(params, clipping_theta, ctx)
-            trainer.step(1)  # 因为已经误差取过均值，梯度不用再做平均。
+            trainer.step(1)  # 因为误差已经取过均值，梯度不用再做平均
             l_sum += l.asscalar() * y.size
             n += y.size
 
diff --git a/chapter_recurrent-neural-networks/rnn-scratch.md b/chapter_recurrent-neural-networks/rnn-scratch.md
index 540e9006488d48737385e8a8e3fad6914c325d9a..8cbc3addd6b902a9cd7f147d31833eb8db8bf649 100644
--- a/chapter_recurrent-neural-networks/rnn-scratch.md
+++ b/chapter_recurrent-neural-networks/rnn-scratch.md
@@ -24,7 +24,7 @@ nd.one_hot(nd.array([0, 2]), vocab_size)
 我们每次采样的小批量的形状是（批量大小，时间步数）。下面的函数将这样的小批量变换成数个可以输入进网络的形状为（批量大小，词典大小）的矩阵，总数与时间步数相等。也就是说，时间步$t$的输入$\boldsymbol{X}_t \in \mathbb{R}^{n \times d}$，其中$n$为批量大小，$d$为输入个数，即one-hot向量长度（词典大小）。
 
 ```{.python .input  n=3}
-def to_onehot(X, size):  # 本函数已保存在 d2lzh 包中方便以后使用。
+def to_onehot(X, size):  # 本函数已保存在d2lzh包中方便以后使用
     return [nd.one_hot(x, size) for x in X.T]
 
 X = nd.arange(10).reshape((2, 5))
@@ -45,14 +45,14 @@ def get_params():
     def _one(shape):
         return nd.random.normal(scale=0.01, shape=shape, ctx=ctx)
 
-    # 隐藏层参数。
+    # 隐藏层参数
     W_xh = _one((num_inputs, num_hiddens))
     W_hh = _one((num_hiddens, num_hiddens))
     b_h = nd.zeros(num_hiddens, ctx=ctx)
-    # 输出层参数。
+    # 输出层参数
     W_hq = _one((num_hiddens, num_outputs))
     b_q = nd.zeros(num_outputs, ctx=ctx)
-    # 附上梯度。
+    # 附上梯度
     params = [W_xh, W_hh, b_h, W_hq, b_q]
     for param in params:
         param.attach_grad()
@@ -72,7 +72,7 @@ def init_rnn_state(batch_size, num_hiddens, ctx):
 
 ```{.python .input  n=6}
 def rnn(inputs, state, params):
-    # inputs 和 outputs 皆为 num_steps 个形状为（batch_size，vocab_size）的矩阵。
+    # inputs和outputs皆为num_steps个形状为(batch_size,vocab_size)的矩阵
     W_xh, W_hh, b_h, W_hq, b_q = params
     H, = state
     outputs = []
@@ -98,17 +98,17 @@ len(outputs), outputs[0].shape, state_new[0].shape
 以下函数基于前缀`prefix`（含有数个字符的字符串）来预测接下来的`num_chars`个字符。这个函数稍显复杂，其中我们将循环神经单元`rnn`设置成了函数参数，这样在后面小节介绍其他循环神经网络时能重复使用这个函数。
 
 ```{.python .input  n=8}
-# 本函数已保存在 d2lzh 包中方便以后使用。
+# 本函数已保存在d2lzh包中方便以后使用
 def predict_rnn(prefix, num_chars, rnn, params, init_rnn_state,
                 num_hiddens, vocab_size, ctx, idx_to_char, char_to_idx):
     state = init_rnn_state(1, num_hiddens, ctx)
     output = [char_to_idx[prefix[0]]]
     for t in range(num_chars + len(prefix) - 1):
-        # 将上一时间步的输出作为当前时间步的输入。
+        # 将上一时间步的输出作为当前时间步的输入
         X = to_onehot(nd.array([output[-1]], ctx=ctx), vocab_size)
-        # 计算输出和更新隐藏状态。
+        # 计算输出和更新隐藏状态
         (Y, state) = rnn(X, state, params)
-        # 下一个时间步的输入是 prefix 里的字符或者当前的最佳预测字符。
+        # 下一个时间步的输入是prefix里的字符或者当前的最佳预测字符
         if t < len(prefix) - 1:
             output.append(char_to_idx[prefix[t + 1]])
         else:
@@ -132,7 +132,7 @@ $$ \min\left(\frac{\theta}{\|\boldsymbol{g}\|}, 1\right)\boldsymbol{g}$$
 的$L_2$范数不超过$\theta$。
 
 ```{.python .input  n=10}
-# 本函数已保存在 d2lzh 包中方便以后使用。
+# 本函数已保存在d2lzh包中方便以后使用
 def grad_clipping(params, theta, ctx):
     norm = nd.array([0], ctx)
     for param in params:
@@ -164,7 +164,7 @@ def grad_clipping(params, theta, ctx):
 另外，考虑到后面将介绍的其它循环神经网络，为了更通用，这里的函数实现更长一些。
 
 ```{.python .input  n=11}
-# 本函数已保存在 d2lzh 包中方便以后使用。
+# 本函数已保存在d2lzh包中方便以后使用
 def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                           vocab_size, ctx, corpus_indices, idx_to_char,
                           char_to_idx, is_random_iter, num_epochs, num_steps,
@@ -178,30 +178,30 @@ def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
     loss = gloss.SoftmaxCrossEntropyLoss()
 
     for epoch in range(num_epochs):
-        if not is_random_iter:  # 如使用相邻采样，在 epoch 开始时初始化隐藏状态。
+        if not is_random_iter:  # 如使用相邻采样，在epoch开始时初始化隐藏状态
             state = init_rnn_state(batch_size, num_hiddens, ctx)
         l_sum, n, start = 0.0, 0, time.time()
         data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, ctx)
         for X, Y in data_iter:
-            if is_random_iter:  # 如使用随机采样，在每个小批量更新前初始化隐藏状态。
+            if is_random_iter:  # 如使用随机采样，在每个小批量更新前初始化隐藏状态
                 state = init_rnn_state(batch_size, num_hiddens, ctx)
-            else:  # 否则需要使用 detach 函数从计算图分离隐藏状态。
+            else:  # 否则需要使用detach函数从计算图分离隐藏状态
                 for s in state:
                     s.detach()
             with autograd.record():
                 inputs = to_onehot(X, vocab_size)
-                # outputs 有 num_steps 个形状为（batch_size，vocab_size）的矩阵。
+                # outputs有num_steps个形状为(batch_size,vocab_size)的矩阵
                 (outputs, state) = rnn(inputs, state, params)
-                # 拼接之后形状为（num_steps * batch_size，vocab_size）。
+                # 拼接之后形状为(num_steps * batch_size,vocab_size)
                 outputs = nd.concat(*outputs, dim=0)
-                # Y 的形状是（batch_size，num_steps），转置后再变成长度为
-                # batch * num_steps 的向量，这样跟输出的行一一对应。
+                # Y的形状是(batch_size,num_steps)，转置后再变成长度为
+                # batch_size * num_steps的向量，这样跟输出的行一一对应
                 y = Y.T.reshape((-1,))
-                # 使用交叉熵损失计算平均分类误差。
+                # 使用交叉熵损失计算平均分类误差
                 l = loss(outputs, y).mean()
             l.backward()
-            grad_clipping(params, clipping_theta, ctx)  # 裁剪梯度。
-            d2l.sgd(params, lr, 1)  # 因为误差已经取过均值，梯度不用再做平均。
+            grad_clipping(params, clipping_theta, ctx)  # 裁剪梯度
+            d2l.sgd(params, lr, 1)  # 因为误差已经取过均值，梯度不用再做平均
             l_sum += l.asscalar() * y.size
             n += y.size