From c93012a99abde89caca489a9660083e530dfaa96 Mon Sep 17 00:00:00 2001
From: ranqiu
Date: Tue, 7 Nov 2017 19:34:36 +0800
Subject: [PATCH] Refine conv_seq_to_seq

---
 conv_seq_to_seq/infer.py  |   6 +--
 conv_seq_to_seq/model.py  | 101 +++++++++++++++++++-------------------
 conv_seq_to_seq/reader.py |   6 +--
 conv_seq_to_seq/train.py  |   2 +-
 4 files changed, 58 insertions(+), 57 deletions(-)

diff --git a/conv_seq_to_seq/infer.py b/conv_seq_to_seq/infer.py
index cba94e53..eb46df55 100644
--- a/conv_seq_to_seq/infer.py
+++ b/conv_seq_to_seq/infer.py
@@ -68,12 +68,12 @@ def parse_args():
         "--beam_size",
         default=1,
         type=int,
-        help="Beam search width. (default: %(default)s)")
+        help="The width of beam expansion. (default: %(default)s)")
     parser.add_argument(
         "--model_path",
         type=str,
         required=True,
-        help="Model path. (default: %(default)s)")
+        help="The path of the trained model. (default: %(default)s)")

     return parser.parse_args()
@@ -122,7 +122,7 @@ def infer(infer_data_path,
     :type drop_rate: float
     :param max_len: The maximum length of the sentence to be generated.
     :type max_len: int
-    :param beam_size: The width of beam search.
+    :param beam_size: The width of beam expansion.
     :type beam_size: int
     """
     # load dict
diff --git a/conv_seq_to_seq/model.py b/conv_seq_to_seq/model.py
index e8e689e9..01dd9428 100644
--- a/conv_seq_to_seq/model.py
+++ b/conv_seq_to_seq/model.py
@@ -20,7 +20,7 @@ def gated_conv_with_batchnorm(input,
     :type input: LayerOutput
     :param size: The dimension of the block's output.
     :type size: int
-    :param context_len: The context width of the convolution.
+    :param context_len: The context length of the convolution.
     :type context_len: int
     :param context_start: The start position of the context.
     :type context_start: int
@@ -81,9 +81,9 @@ def encoder(token_emb,
     :type token_emb: LayerOutput
     :param pos_emb: The embedding vector of the input token's position.
     :type pos_emb: LayerOutput
-    :param conv_blocks: The scale list of the convolution blocks. And each element of the
-                        list contains output dimension and context length of the corresponding
-                        convolution block.
+    :param conv_blocks: The scale list of the convolution blocks. Each element of
+                        the list contains output dimension and context length of
+                        the corresponding convolution block.
     :type conv_blocks: list of tuple
     :param num_attention: The total number of the attention modules used in the decoder.
     :type num_attention: int
@@ -109,9 +109,9 @@ def encoder(token_emb,

     for (size, context_len) in conv_blocks:
         if block_input.size == size:
-            res = block_input
+            residual = block_input
         else:
-            res = paddle.layer.fc(
+            residual = paddle.layer.fc(
                 input=block_input,
                 size=size,
                 act=paddle.activation.Linear(),
@@ -127,9 +127,10 @@ def encoder(token_emb,
             drop_rate=drop_rate)

         with paddle.layer.mixed(size=size) as block_output:
-            block_output += paddle.layer.identity_projection(res)
+            block_output += paddle.layer.identity_projection(residual)
             block_output += paddle.layer.identity_projection(gated_conv)

+        # halve the variance of the sum
         block_output = paddle.layer.slope_intercept(
             input=block_output, slope=math.sqrt(0.5))
@@ -143,14 +144,15 @@ def encoder(token_emb,
         param_attr=paddle.attr.Param(learning_rate=1.0 / (2.0 * num_attention)),
         bias_attr=True)

-    encoded = paddle.layer.addto(input=[encoded_vec, embedding])
+    encoded_sum = paddle.layer.addto(input=[encoded_vec, embedding])

-    encoded = paddle.layer.slope_intercept(input=encoded, slope=math.sqrt(0.5))
+    # halve the variance of the sum
+    encoded_sum = paddle.layer.slope_intercept(input=encoded_sum, slope=math.sqrt(0.5))

-    return encoded_vec, encoded
+    return encoded_vec, encoded_sum


-def attention(decoder_state, cur_embedding, encoded_vec, encoded):
+def attention(decoder_state, cur_embedding, encoded_vec, encoded_sum):
     """
     Definition of the attention.
@@ -160,12 +162,12 @@ def attention(decoder_state, cur_embedding, encoded_vec, encoded):
     :type cur_embedding: LayerOutput
     :param encoded_vec: The source token encoding.
     :type encoded_vec: LayerOutput
-    :param encoded: The sum of the source token's encoding and embedding.
-    :type encoded: LayerOutput
+    :param encoded_sum: The sum of the source token's encoding and embedding.
+    :type encoded_sum: LayerOutput
     :return: A context vector.
     :rtype: LayerOutput
     """
-    res = decoder_state
+    residual = decoder_state
     state_size = decoder_state.size
     emb_dim = cur_embedding.size
@@ -173,6 +175,7 @@ def attention(decoder_state, cur_embedding, encoded_vec, encoded):
         state_summary += paddle.layer.full_matrix_projection(decoder_state)
         state_summary += paddle.layer.identity_projection(cur_embedding)

+    # halve the variance of the sum
     state_summary = paddle.layer.slope_intercept(
         input=state_summary, slope=math.sqrt(0.5))
@@ -186,7 +189,7 @@ def attention(decoder_state, cur_embedding, encoded_vec, encoded):
         act=paddle.activation.SequenceSoftmax(),
         bias_attr=False)

-    scaled = paddle.layer.scaling(weight=attention_weight, input=encoded)
+    scaled = paddle.layer.scaling(weight=attention_weight, input=encoded_sum)
     attended = paddle.layer.pooling(
         input=scaled, pooling_type=paddle.pooling.Sum())
@@ -197,9 +200,9 @@ def attention(decoder_state, cur_embedding, encoded_vec, encoded):
         act=paddle.activation.Linear(),
         bias_attr=True)

-    # TODO scaled by length
+    attention_result = paddle.layer.addto(input=[attended_proj, residual])

-    attention_result = paddle.layer.addto(input=[attended_proj, res])
+    # halve the variance of the sum
     attention_result = paddle.layer.slope_intercept(
         input=attention_result, slope=math.sqrt(0.5))
     return attention_result
@@ -208,7 +211,7 @@ def attention(decoder_state, cur_embedding, encoded_vec, encoded):
 def decoder(token_emb,
             pos_emb,
             encoded_vec,
-            encoded,
+            encoded_sum,
             dict_size,
             conv_blocks=[(256, 3)] * 3,
             drop_rate=0.1):
@@ -221,13 +224,13 @@ def decoder(token_emb,
     :type pos_emb: LayerOutput
     :param encoded_vec: The source token encoding.
     :type encoded_vec: LayerOutput
-    :param encoded: The sum of the source token's encoding and embedding.
-    :type encoded: LayerOutput
+    :param encoded_sum: The sum of the source token's encoding and embedding.
+    :type encoded_sum: LayerOutput
     :param dict_size: The size of the target dictionary.
     :type dict_size: int
-    :param conv_blocks: The scale list of the convolution blocks. And each element of the
-                        list contains output dimension and context length of the corresponding
-                        convolution block.
+    :param conv_blocks: The scale list of the convolution blocks. Each element
+                        of the list contains output dimension and context length
+                        of the corresponding convolution block.
     :type conv_blocks: list of tuple
     :param drop_rate: Dropout rate.
     :type drop_rate: float
@@ -235,24 +238,14 @@ def decoder(token_emb,
     :rtype: LayerOutput
     """

-    def attention_step(decoder_state, cur_embedding, encoded_vec, encoded):
+    def attention_step(decoder_state, cur_embedding, encoded_vec, encoded_sum):
         conditional = attention(
             decoder_state=decoder_state,
             cur_embedding=cur_embedding,
             encoded_vec=encoded_vec,
-            encoded=encoded)
+            encoded_sum=encoded_sum)
         return conditional

-    def softmax_step(input):
-        return paddle.layer.fc(
-            input=input,
-            size=dict_size,
-            act=paddle.activation.Softmax(),
-            param_attr=paddle.attr.Param(
-                initial_mean=0.,
-                initial_std=math.sqrt((1.0 - drop_rate) / input.size)),
-            bias_attr=True, )
-
     embedding = paddle.layer.addto(
         input=[token_emb, pos_emb],
         layer_attr=paddle.attr.Extra(drop_rate=drop_rate))
@@ -269,9 +262,9 @@ def decoder(token_emb,

     for (size, context_len) in conv_blocks:
         if block_input.size == size:
-            res = block_input
+            residual = block_input
         else:
-            res = paddle.layer.fc(
+            residual = paddle.layer.fc(
                 input=block_input,
                 size=size,
                 act=paddle.activation.Linear(),
@@ -288,13 +281,15 @@ def decoder(token_emb,
             decoder_state,
             embedding,
             paddle.layer.StaticInput(input=encoded_vec),
-            paddle.layer.StaticInput(input=encoded),
+            paddle.layer.StaticInput(input=encoded_sum),
         ]

         conditional = paddle.layer.recurrent_group(
             step=attention_step, input=group_inputs)

-        block_output = paddle.layer.addto(input=[conditional, res])
+        block_output = paddle.layer.addto(input=[conditional, residual])
+
+        # halve the variance of the sum
         block_output = paddle.layer.slope_intercept(
             input=block_output, slope=math.sqrt(0.5))
@@ -307,8 +302,14 @@ def decoder(token_emb,
         act=paddle.activation.Linear(),
         layer_attr=paddle.attr.Extra(drop_rate=drop_rate))

-    decoder_out = paddle.layer.recurrent_group(
-        step=softmax_step, input=[block_output])
+    decoder_out = paddle.layer.fc(
+        input=block_output,
+        size=dict_size,
+        act=paddle.activation.Softmax(),
+        param_attr=paddle.attr.Param(
+            initial_mean=0.,
+            initial_std=math.sqrt((1.0 - drop_rate) / block_output.size)),
+        bias_attr=True)

     return decoder_out
@@ -333,13 +334,13 @@ def conv_seq2seq(src_dict_size,
     :type pos_size: int
     :param emb_dim: The dimension of the embedding vector.
     :type emb_dim: int
-    :param enc_conv_blocks: The scale list of the encoder's convolution blocks. And each element of
-                            the list contains output dimension and context length of the corresponding
-                            convolution block.
+    :param enc_conv_blocks: The scale list of the encoder's convolution blocks. Each element
+                            of the list contains output dimension and context length of the
+                            corresponding convolution block.
     :type enc_conv_blocks: list of tuple
-    :param dec_conv_blocks: The scale list of the decoder's convolution blocks.
-                            And each element of the list contains output dimension and
-                            context length of the corresponding convolution block.
+    :param dec_conv_blocks: The scale list of the decoder's convolution blocks. Each element
+                            of the list contains output dimension and context length of the
+                            corresponding convolution block.
     :type dec_conv_blocks: list of tuple
     :param drop_rate: Dropout rate.
     :type drop_rate: float
@@ -368,7 +369,7 @@ def conv_seq2seq(src_dict_size,
         param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1))

     num_attention = len(dec_conv_blocks)
-    encoded_vec, encoded = encoder(
+    encoded_vec, encoded_sum = encoder(
         token_emb=src_emb,
         pos_emb=src_pos_emb,
         conv_blocks=enc_conv_blocks,
@@ -399,7 +400,7 @@ def conv_seq2seq(src_dict_size,
         token_emb=trg_emb,
         pos_emb=trg_pos_emb,
         encoded_vec=encoded_vec,
-        encoded=encoded,
+        encoded_sum=encoded_sum,
         dict_size=trg_dict_size,
         conv_blocks=dec_conv_blocks,
         drop_rate=drop_rate)
@@ -413,4 +414,4 @@ def conv_seq2seq(src_dict_size,
     cost = paddle.layer.classification_cost(
         input=decoder_out, label=trg_next_word)

-    return cost
\ No newline at end of file
+    return cost
diff --git a/conv_seq_to_seq/reader.py b/conv_seq_to_seq/reader.py
index 1105fdc5..6d4db49f 100644
--- a/conv_seq_to_seq/reader.py
+++ b/conv_seq_to_seq/reader.py
@@ -18,7 +18,7 @@ def get_reverse_dict(dictionary):


 def load_data(data_file, src_dict, trg_dict):
-    UNK_IDX = src_dict['UNK']
+    UNK_IDX = src_dict['<unk>']
     with open(data_file, 'r') as f:
         for line in f:
             line_split = line.strip().split('\t')
@@ -34,7 +34,7 @@ def load_data(data_file, src_dict, trg_dict):

 def data_reader(data_file, src_dict, trg_dict, pos_size, padding_num):
     def reader():
-        UNK_IDX = src_dict['UNK']
+        UNK_IDX = src_dict['<unk>']
         word_padding = trg_dict.__len__()
         pos_padding = pos_size
@@ -64,4 +64,4 @@ def data_reader(data_file, src_dict, trg_dict, pos_size, padding_num):
             trg_next_word = trg_next_word + [trg_dict['<e>']] * padding_num

             yield src_word, src_word_pos, trg_word, trg_word_pos, trg_next_word

-    return reader
\ No newline at end of file
+    return reader
diff --git a/conv_seq_to_seq/train.py b/conv_seq_to_seq/train.py
index b86fed67..c6ce0dff 100644
--- a/conv_seq_to_seq/train.py
+++ b/conv_seq_to_seq/train.py
@@ -249,4 +249,4 @@ def main():


 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
--
GitLab
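
For context on the "# halve the variance of the sum" comments added by this patch: the model repeatedly sums two roughly unit-variance signals (a residual and a block output, or an encoding and an embedding) with paddle.layer.addto and then rescales the sum by sqrt(0.5) via paddle.layer.slope_intercept so the variance stays close to one, as in the convolutional seq2seq paper. A minimal sketch of the pattern follows, assuming the paddle.v2 API used elsewhere in this repository; the layer names and the 256-dimensional size are illustrative only, not taken from the patch.

    import math

    import paddle.v2 as paddle  # assumed import alias, as in the rest of the repo

    paddle.init(use_gpu=False, trainer_count=1)

    # Illustrative 256-dimensional input standing in for a conv block's input.
    block_input = paddle.layer.data(
        name="block_input", type=paddle.data_type.dense_vector(256))

    # Stand-in for the gated convolution output of the same width.
    gated_conv = paddle.layer.fc(
        input=block_input, size=256, act=paddle.activation.Linear())

    # Residual connection: summing two independent, unit-variance signals
    # doubles the variance, so scaling the sum by sqrt(0.5) restores it.
    block_output = paddle.layer.addto(input=[block_input, gated_conv])
    block_output = paddle.layer.slope_intercept(
        input=block_output, slope=math.sqrt(0.5))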