From c05f68b28fb044310af41809b751abf4ac28346e Mon Sep 17 00:00:00 2001 From: guoshengCS Date: Tue, 19 Feb 2019 19:41:38 +0800 Subject: [PATCH] Fix book-nmt README --- 08.machine_translation/README.cn.md | 65 +++++++++++++++------------- 08.machine_translation/index.cn.html | 65 +++++++++++++++------------- 2 files changed, 72 insertions(+), 58 deletions(-) diff --git a/08.machine_translation/README.cn.md b/08.machine_translation/README.cn.md index 1244f94..5b91c74 100644 --- a/08.machine_translation/README.cn.md +++ b/08.machine_translation/README.cn.md @@ -54,9 +54,9 @@ ### 编码器-解码器框架 编码器-解码器(Encoder-Decoder)\[[2](#参考文献)\]框架用于解决由一个任意长度的源序列到另一个任意长度的目标序列的变换问题。即编码阶段将整个源序列编码成一个向量,解码阶段通过最大化预测序列概率,从中解码出整个目标序列。编码和解码的过程通常都使用RNN实现。 -![encoder_decoder](./image/encoder_decoder.png) +
-
+
图3. 编码器-解码器框架
@@ -82,7 +82,7 @@ 机器翻译任务的训练过程中,解码阶段的目标是最大化下一个正确的目标语言词的概率。思路是: 1. 每一个时刻,根据源语言句子的编码信息(又叫上下文向量,context vector)$c$、真实目标语言序列的第$i$个词$u_i$和$i$时刻RNN的隐层状态$z_i$,计算出下一个隐层状态$z_{i+1}$。计算公式如下: $$z_{i+1}=\phi_{\theta '} \left ( c,u_i,z_i \right )$$ -其中$\phi _{\theta '}$是一个非线性激活函数;$c=q\mathbf{h}$是源语言句子的上下文向量,在不使用注意力机制时,如果[编码器](#编码器)的输出是源语言句子编码后的最后一个元素,则可以定义$c=h_T$;$u_i$是目标语言序列的第$i$个单词,$u_0$是目标语言序列的开始标记`<s>`,表示解码开始;$z_i$是$i$时刻解码RNN的隐层状态,$z_0$是一个全零的向量。 +其中$\phi _{\theta '}$是一个非线性激活函数;$c$是源语言句子的上下文向量,在不使用注意力机制时,如果[编码器](#编码器)的输出是源语言句子编码后的最后一个元素,则可以定义$c=h_T$;$u_i$是目标语言序列的第$i$个单词,$u_0$是目标语言序列的开始标记`<s>`,表示解码开始;$z_i$是$i$时刻解码RNN的隐层状态,$z_0$是一个全零的向量。 2. 将$z_{i+1}$通过`softmax`归一化,得到目标语言序列的第$i+1$个单词的概率分布$p_{i+1}$。概率分布公式如下: $$p\left ( u_{i+1}|u_{<i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$ @@ -160,33 +160,35 @@ except ImportError: from paddle.fluid.trainer import * from paddle.fluid.inferencer import * -dict_size = 30000 -source_dict_dim = target_dict_dim = dict_size -hidden_dim = 32 -word_dim = 16 -batch_size = 2 -max_length = 8 -topk_size = 50 -beam_size = 2 +dict_size = 30000 # 字典维度 +source_dict_dim = target_dict_dim = dict_size # 源/目标语言字典维度 +hidden_dim = 32 # 编码器中的隐层大小 +word_dim = 16 # 词向量维度 +batch_size = 2 # batch 中的样本数 +max_length = 8 # 生成句子的最大长度 +beam_size = 2 # 柱宽度 -decoder_size = hidden_dim +decoder_size = hidden_dim # 解码器中的隐层大小 ``` 然后如下实现编码器框架: ```python def encoder(is_sparse): + # 定义源语言id序列的输入数据 src_word_id = pd.data( name="src_word_id", shape=[1], dtype='int64', lod_level=1) + # 将上述编码映射到低维语言空间的词向量 src_embedding = pd.embedding( input=src_word_id, size=[dict_size, word_dim], dtype='float32', is_sparse=is_sparse, param_attr=fluid.ParamAttr(name='vemb')) - + # LSTM层:fc + dynamic_lstm fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') lstm_hidden0, lstm_0 = pd.dynamic_lstm(input=fc1, size=hidden_dim * 4) + # 取源语言序列编码后的最后一个状态 encoder_out = pd.sequence_last_step(input=lstm_hidden0) return encoder_out ``` @@ -195,6 +197,7 @@ decoder_size = hidden_dim ```python def 
train_decoder(context, is_sparse): + # 定义目标语言id序列的输入数据,并映射到低维语言空间的词向量 trg_language_word = pd.data( name="target_language_word", shape=[1], dtype='int64', lod_level=1) trg_embedding = pd.embedding( @@ -205,17 +208,22 @@ decoder_size = hidden_dim param_attr=fluid.ParamAttr(name='vemb')) rnn = pd.DynamicRNN() - with rnn.block(): + with rnn.block(): # 使用 DynamicRNN 定义每一步的计算 + # 获取当前步目标语言输入的词向量 current_word = rnn.step_input(trg_embedding) + # 获取隐层状态 pre_state = rnn.memory(init=context) + # 解码器计算单元:单层前馈网络 current_state = pd.fc(input=[current_word, pre_state], size=decoder_size, act='tanh') - + # 计算归一化的单词预测概率 current_score = pd.fc(input=current_state, size=target_dict_dim, act='softmax') + # 更新RNN的隐层状态 rnn.update_memory(pre_state, current_state) + # 输出预测概率 rnn.output(current_score) return rnn() @@ -226,14 +234,14 @@ decoder_size = hidden_dim ```python def decode(context, is_sparse): init_state = context + # 定义解码过程循环计数变量 array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length) counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True) - # fill the first element with init_state + # 定义 tensor array 用以保存各个时间步的内容,并写入初始id,score和state state_array = pd.create_array('float32') pd.array_write(init_state, array=state_array, i=counter) - # ids, scores as memory ids_array = pd.create_array('int64') scores_array = pd.create_array('float32') @@ -244,34 +252,35 @@ def decode(context, is_sparse): pd.array_write(init_ids, array=ids_array, i=counter) pd.array_write(init_scores, array=scores_array, i=counter) + # 定义循环终止条件变量 cond = pd.less_than(x=counter, y=array_len) - + # 定义 while_op while_op = pd.While(cond=cond) - with while_op.block(): + with while_op.block(): # 定义每一步的计算 + # 获取解码器在当前步的输入,包括上一步选择的id,对应的score和上一步的state pre_ids = pd.array_read(array=ids_array, i=counter) pre_state = pd.array_read(array=state_array, i=counter) pre_score = pd.array_read(array=scores_array, i=counter) - # expand the lod of pre_state to be the same with pre_score + # 更新输入的state为上一步选择id对应的state 
pre_state_expanded = pd.sequence_expand(pre_state, pre_score) - + # 同训练模式下解码器中的计算逻辑,包括获取输入向量,解码器计算单元计算和 + # 归一化单词预测概率的计算 pre_ids_emb = pd.embedding( input=pre_ids, size=[dict_size, word_dim], dtype='float32', is_sparse=is_sparse) - - # use rnn unit to update rnn current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb], size=decoder_size, act='tanh') current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score) - # use score to do beam search current_score = pd.fc(input=current_state_with_lod, size=target_dict_dim, act='softmax') topk_scores, topk_indices = pd.topk(current_score, k=beam_size) - # calculate accumulated scores after topk to reduce computation cost + + # 计算累计得分,进行beam search accu_scores = pd.elementwise_add( x=pd.log(topk_scores), y=pd.reshape(pre_score, shape=[-1]), axis=0) selected_ids, selected_scores = pd.beam_search( @@ -284,14 +293,12 @@ def decode(context, is_sparse): level=0) pd.increment(x=counter, value=1, in_place=True) - - # update the memories + # 将 search 结果和对应的隐层状态写入 tensor array 中 pd.array_write(current_state, array=state_array, i=counter) pd.array_write(selected_ids, array=ids_array, i=counter) pd.array_write(selected_scores, array=scores_array, i=counter) - # update the break condition: up to the max length or all candidates of - # source sentences have ended. + # 更新循环终止条件 length_cond = pd.less_than(x=counter, y=array_len) finish_cond = pd.logical_not(pd.is_empty(x=selected_ids)) pd.logical_and(x=length_cond, y=finish_cond, out=cond) diff --git a/08.machine_translation/index.cn.html b/08.machine_translation/index.cn.html index 453cc67..849f877 100644 --- a/08.machine_translation/index.cn.html +++ b/08.machine_translation/index.cn.html @@ -96,9 +96,9 @@ ### 编码器-解码器框架 编码器-解码器(Encoder-Decoder)\[[2](#参考文献)\]框架用于解决由一个任意长度的源序列到另一个任意长度的目标序列的变换问题。即编码阶段将整个源序列编码成一个向量,解码阶段通过最大化预测序列概率,从中解码出整个目标序列。编码和解码的过程通常都使用RNN实现。 -![encoder_decoder](./image/encoder_decoder.png) +
-
+
图3. 编码器-解码器框架
@@ -124,7 +124,7 @@ 机器翻译任务的训练过程中,解码阶段的目标是最大化下一个正确的目标语言词的概率。思路是: 1. 每一个时刻,根据源语言句子的编码信息(又叫上下文向量,context vector)$c$、真实目标语言序列的第$i$个词$u_i$和$i$时刻RNN的隐层状态$z_i$,计算出下一个隐层状态$z_{i+1}$。计算公式如下: $$z_{i+1}=\phi_{\theta '} \left ( c,u_i,z_i \right )$$ -其中$\phi _{\theta '}$是一个非线性激活函数;$c=q\mathbf{h}$是源语言句子的上下文向量,在不使用注意力机制时,如果[编码器](#编码器)的输出是源语言句子编码后的最后一个元素,则可以定义$c=h_T$;$u_i$是目标语言序列的第$i$个单词,$u_0$是目标语言序列的开始标记`<s>`,表示解码开始;$z_i$是$i$时刻解码RNN的隐层状态,$z_0$是一个全零的向量。 +其中$\phi _{\theta '}$是一个非线性激活函数;$c$是源语言句子的上下文向量,在不使用注意力机制时,如果[编码器](#编码器)的输出是源语言句子编码后的最后一个元素,则可以定义$c=h_T$;$u_i$是目标语言序列的第$i$个单词,$u_0$是目标语言序列的开始标记`<s>`,表示解码开始;$z_i$是$i$时刻解码RNN的隐层状态,$z_0$是一个全零的向量。 2. 将$z_{i+1}$通过`softmax`归一化,得到目标语言序列的第$i+1$个单词的概率分布$p_{i+1}$。概率分布公式如下: $$p\left ( u_{i+1}|u_{<i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$ @@ -202,33 +202,35 @@ except ImportError: from paddle.fluid.trainer import * from paddle.fluid.inferencer import * -dict_size = 30000 -source_dict_dim = target_dict_dim = dict_size -hidden_dim = 32 -word_dim = 16 -batch_size = 2 -max_length = 8 -topk_size = 50 -beam_size = 2 +dict_size = 30000 # 字典维度 +source_dict_dim = target_dict_dim = dict_size # 源/目标语言字典维度 +hidden_dim = 32 # 编码器中的隐层大小 +word_dim = 16 # 词向量维度 +batch_size = 2 # batch 中的样本数 +max_length = 8 # 生成句子的最大长度 +beam_size = 2 # 柱宽度 -decoder_size = hidden_dim +decoder_size = hidden_dim # 解码器中的隐层大小 ``` 然后如下实现编码器框架: ```python def encoder(is_sparse): + # 定义源语言id序列的输入数据 src_word_id = pd.data( name="src_word_id", shape=[1], dtype='int64', lod_level=1) + # 将上述编码映射到低维语言空间的词向量 src_embedding = pd.embedding( input=src_word_id, size=[dict_size, word_dim], dtype='float32', is_sparse=is_sparse, param_attr=fluid.ParamAttr(name='vemb')) - + # LSTM层:fc + dynamic_lstm fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') lstm_hidden0, lstm_0 = pd.dynamic_lstm(input=fc1, size=hidden_dim * 4) + # 取源语言序列编码后的最后一个状态 encoder_out = pd.sequence_last_step(input=lstm_hidden0) return encoder_out ``` @@ -237,6 +239,7 @@ decoder_size = hidden_dim ```python def 
train_decoder(context, is_sparse): + # 定义目标语言id序列的输入数据,并映射到低维语言空间的词向量 trg_language_word = pd.data( name="target_language_word", shape=[1], dtype='int64', lod_level=1) trg_embedding = pd.embedding( @@ -247,17 +250,22 @@ decoder_size = hidden_dim param_attr=fluid.ParamAttr(name='vemb')) rnn = pd.DynamicRNN() - with rnn.block(): + with rnn.block(): # 使用 DynamicRNN 定义每一步的计算 + # 获取当前步目标语言输入的词向量 current_word = rnn.step_input(trg_embedding) + # 获取隐层状态 pre_state = rnn.memory(init=context) + # 解码器计算单元:单层前馈网络 current_state = pd.fc(input=[current_word, pre_state], size=decoder_size, act='tanh') - + # 计算归一化的单词预测概率 current_score = pd.fc(input=current_state, size=target_dict_dim, act='softmax') + # 更新RNN的隐层状态 rnn.update_memory(pre_state, current_state) + # 输出预测概率 rnn.output(current_score) return rnn() @@ -268,14 +276,14 @@ decoder_size = hidden_dim ```python def decode(context, is_sparse): init_state = context + # 定义解码过程循环计数变量 array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length) counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True) - # fill the first element with init_state + # 定义 tensor array 用以保存各个时间步的内容,并写入初始id,score和state state_array = pd.create_array('float32') pd.array_write(init_state, array=state_array, i=counter) - # ids, scores as memory ids_array = pd.create_array('int64') scores_array = pd.create_array('float32') @@ -286,34 +294,35 @@ def decode(context, is_sparse): pd.array_write(init_ids, array=ids_array, i=counter) pd.array_write(init_scores, array=scores_array, i=counter) + # 定义循环终止条件变量 cond = pd.less_than(x=counter, y=array_len) - + # 定义 while_op while_op = pd.While(cond=cond) - with while_op.block(): + with while_op.block(): # 定义每一步的计算 + # 获取解码器在当前步的输入,包括上一步选择的id,对应的score和上一步的state pre_ids = pd.array_read(array=ids_array, i=counter) pre_state = pd.array_read(array=state_array, i=counter) pre_score = pd.array_read(array=scores_array, i=counter) - # expand the lod of pre_state to be the same with pre_score + # 更新输入的state为上一步选择id对应的state 
pre_state_expanded = pd.sequence_expand(pre_state, pre_score) - + # 同训练模式下解码器中的计算逻辑,包括获取输入向量,解码器计算单元计算和 + # 归一化单词预测概率的计算 pre_ids_emb = pd.embedding( input=pre_ids, size=[dict_size, word_dim], dtype='float32', is_sparse=is_sparse) - - # use rnn unit to update rnn current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb], size=decoder_size, act='tanh') current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score) - # use score to do beam search current_score = pd.fc(input=current_state_with_lod, size=target_dict_dim, act='softmax') topk_scores, topk_indices = pd.topk(current_score, k=beam_size) - # calculate accumulated scores after topk to reduce computation cost + + # 计算累计得分,进行beam search accu_scores = pd.elementwise_add( x=pd.log(topk_scores), y=pd.reshape(pre_score, shape=[-1]), axis=0) selected_ids, selected_scores = pd.beam_search( @@ -326,14 +335,12 @@ def decode(context, is_sparse): level=0) pd.increment(x=counter, value=1, in_place=True) - - # update the memories + # 将 search 结果和对应的隐层状态写入 tensor array 中 pd.array_write(current_state, array=state_array, i=counter) pd.array_write(selected_ids, array=ids_array, i=counter) pd.array_write(selected_scores, array=scores_array, i=counter) - # update the break condition: up to the max length or all candidates of - # source sentences have ended. + # 更新循环终止条件 length_cond = pd.less_than(x=counter, y=array_len) finish_cond = pd.logical_not(pd.is_empty(x=selected_ids)) pd.logical_and(x=length_cond, y=finish_cond, out=cond) -- GitLab