Commit c93012a9 authored by ranqiu

Refine conv_seq_to_seq

Parent 578f4099
@@ -68,12 +68,12 @@ def parse_args():
         "--beam_size",
         default=1,
         type=int,
-        help="Beam search width. (default: %(default)s)")
+        help="The width of beam expansion. (default: %(default)s)")
     parser.add_argument(
         "--model_path",
         type=str,
         required=True,
-        help="Model path. (default: %(default)s)")
+        help="The path of the trained model. (default: %(default)s)")
     return parser.parse_args()
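
For context, beam_size bounds how many partial hypotheses survive each expansion step during generation. A minimal sketch of one expansion step (plain Python; the function and variable names are illustrative only, not taken from this model, which relies on PaddlePaddle's built-in beam-search generation):

    import heapq
    import math

    def expand_beam(beams, next_word_probs, beam_size):
        # beams: list of (log_prob, token_ids); next_word_probs[i]: {word_id: prob} for beam i
        candidates = []
        for (score, tokens), probs in zip(beams, next_word_probs):
            for word_id, p in probs.items():
                candidates.append((score + math.log(p), tokens + [word_id]))
        # keep only the beam_size highest-scoring partial hypotheses
        return heapq.nlargest(beam_size, candidates, key=lambda c: c[0])
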
@@ -122,7 +122,7 @@ def infer(infer_data_path,
     :type drop_rate: float
     :param max_len: The maximum length of the sentence to be generated.
     :type max_len: int
-    :param beam_size: The width of beam search.
+    :param beam_size: The width of beam expansion.
     :type beam_size: int
     """
     # load dict

@@ -20,7 +20,7 @@ def gated_conv_with_batchnorm(input,
     :type input: LayerOutput
     :param size: The dimension of the block's output.
     :type size: int
-    :param context_len: The context width of the convolution.
+    :param context_len: The context length of the convolution.
     :type context_len: int
     :param context_start: The start position of the context.
     :type context_start: int
@@ -81,9 +81,9 @@ def encoder(token_emb,
     :type token_emb: LayerOutput
     :param pos_emb: The embedding vector of the input token's position.
     :type pos_emb: LayerOutput
-    :param conv_blocks: The scale list of the convolution blocks. And each element of the
-                        list contains output dimension and context length of the corresponding
-                        convolution block.
+    :param conv_blocks: The scale list of the convolution blocks. Each element of
+                        the list contains output dimension and context length of
+                        the corresponding convolution block.
     :type conv_blocks: list of tuple
     :param num_attention: The total number of the attention modules used in the decoder.
     :type num_attention: int
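
For example, with the default used by the decoder below, conv_blocks=[(256, 3)] * 3 describes three stacked convolution blocks, each producing a 256-dimensional output with a context length of 3:

    conv_blocks = [(256, 3)] * 3     # [(256, 3), (256, 3), (256, 3)]
    for size, context_len in conv_blocks:
        print(size, context_len)     # prints "256 3" three times
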
@@ -109,9 +109,9 @@ def encoder(token_emb,
     for (size, context_len) in conv_blocks:
         if block_input.size == size:
-            res = block_input
+            residual = block_input
         else:
-            res = paddle.layer.fc(
+            residual = paddle.layer.fc(
                 input=block_input,
                 size=size,
                 act=paddle.activation.Linear(),
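
The renamed variable makes the standard residual pattern easier to read: the shortcut is the block input itself when the dimensions already match, and a linear projection otherwise. A condensed sketch of that pattern (the helper name is hypothetical; it only uses the paddle.layer calls already shown above):

    def make_residual(block_input, size):
        # identity shortcut when the input already has the block's output size
        if block_input.size == size:
            return block_input
        # otherwise project linearly so the residual can be added to the block output
        return paddle.layer.fc(
            input=block_input,
            size=size,
            act=paddle.activation.Linear())
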
@@ -127,9 +127,10 @@ def encoder(token_emb,
             drop_rate=drop_rate)
         with paddle.layer.mixed(size=size) as block_output:
-            block_output += paddle.layer.identity_projection(res)
+            block_output += paddle.layer.identity_projection(residual)
             block_output += paddle.layer.identity_projection(gated_conv)
+        # halve the variance of the sum
         block_output = paddle.layer.slope_intercept(
             input=block_output, slope=math.sqrt(0.5))
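
The new comment states the purpose of the sqrt(0.5) scale: the block output and the residual are treated as roughly independent, equal-variance signals, so their sum has about twice the variance of either one, and multiplying by sqrt(0.5) restores it. A quick standalone numerical check (numpy is used here only for the check; it is not imported by the model code):

    import math
    import numpy as np

    a = np.random.randn(100000)                  # variance ~ 1
    b = np.random.randn(100000)                  # variance ~ 1, independent of a
    print(np.var(a + b))                         # ~ 2.0
    print(np.var((a + b) * math.sqrt(0.5)))      # ~ 1.0 again
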
@@ -143,14 +144,15 @@ def encoder(token_emb,
         param_attr=paddle.attr.Param(learning_rate=1.0 / (2.0 * num_attention)),
         bias_attr=True)
-    encoded = paddle.layer.addto(input=[encoded_vec, embedding])
-    encoded = paddle.layer.slope_intercept(input=encoded, slope=math.sqrt(0.5))
+    encoded_sum = paddle.layer.addto(input=[encoded_vec, embedding])
+    # halve the variance of the sum
+    encoded_sum = paddle.layer.slope_intercept(input=encoded_sum, slope=math.sqrt(0.5))
-    return encoded_vec, encoded
+    return encoded_vec, encoded_sum

-def attention(decoder_state, cur_embedding, encoded_vec, encoded):
+def attention(decoder_state, cur_embedding, encoded_vec, encoded_sum):
     """
     Definition of the attention.

@@ -160,12 +162,12 @@ def attention(decoder_state, cur_embedding, encoded_vec, encoded):
     :type cur_embedding: LayerOutput
     :param encoded_vec: The source token encoding.
     :type encoded_vec: LayerOutput
-    :param encoded: The sum of the source token's encoding and embedding.
-    :type encoded: LayerOutput
+    :param encoded_sum: The sum of the source token's encoding and embedding.
+    :type encoded_sum: LayerOutput
     :return: A context vector.
     :rtype: LayerOutput
     """
-    res = decoder_state
+    residual = decoder_state
     state_size = decoder_state.size
     emb_dim = cur_embedding.size

@@ -173,6 +175,7 @@ def attention(decoder_state, cur_embedding, encoded_vec, encoded):
         state_summary += paddle.layer.full_matrix_projection(decoder_state)
         state_summary += paddle.layer.identity_projection(cur_embedding)
+    # halve the variance of the sum
     state_summary = paddle.layer.slope_intercept(
         input=state_summary, slope=math.sqrt(0.5))
@@ -186,7 +189,7 @@ def attention(decoder_state, cur_embedding, encoded_vec, encoded):
         act=paddle.activation.SequenceSoftmax(),
         bias_attr=False)
-    scaled = paddle.layer.scaling(weight=attention_weight, input=encoded)
+    scaled = paddle.layer.scaling(weight=attention_weight, input=encoded_sum)
     attended = paddle.layer.pooling(
         input=scaled, pooling_type=paddle.pooling.Sum())
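
Put as a formula (the symbols are mine, following the ConvS2S convention rather than names from this code): with encoder outputs z_j (encoded_vec), source embeddings e_j, and the decoder-side summary d (state_summary), the pooled context produced above is

    c = \sum_j a_j (z_j + e_j),    a_j = \frac{\exp(d^\top z_j)}{\sum_k \exp(d^\top z_k)}

so encoded_sum supplies the z_j + e_j terms that are scaled and sum-pooled, and the SequenceSoftmax supplies the weights a_j; the score d^\top z_j itself is computed in lines elided from this hunk.
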
...@@ -197,9 +200,9 @@ def attention(decoder_state, cur_embedding, encoded_vec, encoded): ...@@ -197,9 +200,9 @@ def attention(decoder_state, cur_embedding, encoded_vec, encoded):
act=paddle.activation.Linear(), act=paddle.activation.Linear(),
bias_attr=True) bias_attr=True)
# TODO scaled by length attention_result = paddle.layer.addto(input=[attended_proj, residual])
attention_result = paddle.layer.addto(input=[attended_proj, res]) # halve the variance of the sum
attention_result = paddle.layer.slope_intercept( attention_result = paddle.layer.slope_intercept(
input=attention_result, slope=math.sqrt(0.5)) input=attention_result, slope=math.sqrt(0.5))
return attention_result return attention_result
@@ -208,7 +211,7 @@ def attention(decoder_state, cur_embedding, encoded_vec, encoded):
 def decoder(token_emb,
             pos_emb,
             encoded_vec,
-            encoded,
+            encoded_sum,
             dict_size,
             conv_blocks=[(256, 3)] * 3,
             drop_rate=0.1):
@@ -221,13 +224,13 @@ def decoder(token_emb,
     :type pos_emb: LayerOutput
     :param encoded_vec: The source token encoding.
     :type encoded_vec: LayerOutput
-    :param encoded: The sum of the source token's encoding and embedding.
-    :type encoded: LayerOutput
+    :param encoded_sum: The sum of the source token's encoding and embedding.
+    :type encoded_sum: LayerOutput
     :param dict_size: The size of the target dictionary.
     :type dict_size: int
-    :param conv_blocks: The scale list of the convolution blocks. And each element of the
-                        list contains output dimension and context length of the corresponding
-                        convolution block.
+    :param conv_blocks: The scale list of the convolution blocks. Each element
+                        of the list contains output dimension and context length
+                        of the corresponding convolution block.
     :type conv_blocks: list of tuple
     :param drop_rate: Dropout rate.
     :type drop_rate: float
@@ -235,24 +238,14 @@ def decoder(token_emb,
     :rtype: LayerOutput
     """

-    def attention_step(decoder_state, cur_embedding, encoded_vec, encoded):
+    def attention_step(decoder_state, cur_embedding, encoded_vec, encoded_sum):
         conditional = attention(
             decoder_state=decoder_state,
             cur_embedding=cur_embedding,
             encoded_vec=encoded_vec,
-            encoded=encoded)
+            encoded_sum=encoded_sum)
         return conditional

-    def softmax_step(input):
-        return paddle.layer.fc(
-            input=input,
-            size=dict_size,
-            act=paddle.activation.Softmax(),
-            param_attr=paddle.attr.Param(
-                initial_mean=0.,
-                initial_std=math.sqrt((1.0 - drop_rate) / input.size)),
-            bias_attr=True, )
-
     embedding = paddle.layer.addto(
         input=[token_emb, pos_emb],
         layer_attr=paddle.attr.Extra(drop_rate=drop_rate))
@@ -269,9 +262,9 @@ def decoder(token_emb,
     for (size, context_len) in conv_blocks:
         if block_input.size == size:
-            res = block_input
+            residual = block_input
         else:
-            res = paddle.layer.fc(
+            residual = paddle.layer.fc(
                 input=block_input,
                 size=size,
                 act=paddle.activation.Linear(),
@@ -288,13 +281,15 @@ def decoder(token_emb,
             decoder_state,
             embedding,
             paddle.layer.StaticInput(input=encoded_vec),
-            paddle.layer.StaticInput(input=encoded),
+            paddle.layer.StaticInput(input=encoded_sum),
         ]

         conditional = paddle.layer.recurrent_group(
             step=attention_step, input=group_inputs)

-        block_output = paddle.layer.addto(input=[conditional, res])
+        block_output = paddle.layer.addto(input=[conditional, residual])
+        # halve the variance of the sum
         block_output = paddle.layer.slope_intercept(
             input=block_output, slope=math.sqrt(0.5))
@@ -307,8 +302,14 @@ def decoder(token_emb,
         act=paddle.activation.Linear(),
         layer_attr=paddle.attr.Extra(drop_rate=drop_rate))

-    decoder_out = paddle.layer.recurrent_group(
-        step=softmax_step, input=[block_output])
+    decoder_out = paddle.layer.fc(
+        input=block_output,
+        size=dict_size,
+        act=paddle.activation.Softmax(),
+        param_attr=paddle.attr.Param(
+            initial_mean=0.,
+            initial_std=math.sqrt((1.0 - drop_rate) / block_output.size)),
+        bias_attr=True)

     return decoder_out
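
The replacement output layer keeps the same initialization scheme the removed softmax_step used: initial_std = sqrt((1 - drop_rate) / fan_in), where fan_in is now block_output.size. A quick worked value under the defaults above (drop_rate=0.1; the 256-dimensional block output is an assumption taken from the default conv_blocks):

    import math
    drop_rate = 0.1
    fan_in = 256                                   # block_output.size under conv_blocks=[(256, 3)] * 3
    print(math.sqrt((1.0 - drop_rate) / fan_in))   # ~0.0593
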
@@ -333,13 +334,13 @@ def conv_seq2seq(src_dict_size,
     :type pos_size: int
     :param emb_dim: The dimension of the embedding vector.
     :type emb_dim: int
-    :param enc_conv_blocks: The scale list of the encoder's convolution blocks. And each element of
-                            the list contains output dimension and context length of the corresponding
-                            convolution block.
+    :param enc_conv_blocks: The scale list of the encoder's convolution blocks. Each element
+                            of the list contains output dimension and context length of the
+                            corresponding convolution block.
     :type enc_conv_blocks: list of tuple
-    :param dec_conv_blocks: The scale list of the decoder's convolution blocks. And each element of
-                            the list contains output dimension and context length of the corresponding
-                            convolution block.
+    :param dec_conv_blocks: The scale list of the decoder's convolution blocks. Each element
+                            of the list contains output dimension and context length of the
+                            corresponding convolution block.
     :type dec_conv_blocks: list of tuple
     :param drop_rate: Dropout rate.
     :type drop_rate: float
@@ -368,7 +369,7 @@ def conv_seq2seq(src_dict_size,
         param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1))

     num_attention = len(dec_conv_blocks)
-    encoded_vec, encoded = encoder(
+    encoded_vec, encoded_sum = encoder(
         token_emb=src_emb,
         pos_emb=src_pos_emb,
         conv_blocks=enc_conv_blocks,
@@ -399,7 +400,7 @@ def conv_seq2seq(src_dict_size,
         token_emb=trg_emb,
         pos_emb=trg_pos_emb,
         encoded_vec=encoded_vec,
-        encoded=encoded,
+        encoded_sum=encoded_sum,
         dict_size=trg_dict_size,
         conv_blocks=dec_conv_blocks,
         drop_rate=drop_rate)
@@ -413,4 +414,4 @@ def conv_seq2seq(src_dict_size,
     cost = paddle.layer.classification_cost(
         input=decoder_out, label=trg_next_word)

-    return cost
\ No newline at end of file
+    return cost
@@ -18,7 +18,7 @@ def get_reverse_dict(dictionary):
 def load_data(data_file, src_dict, trg_dict):
-    UNK_IDX = src_dict['UNK']
+    UNK_IDX = src_dict['<unk>']
     with open(data_file, 'r') as f:
         for line in f:
             line_split = line.strip().split('\t')
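
The key rename matters because every out-of-vocabulary token falls back to this index when words are mapped to ids. A minimal sketch of that lookup (the body of load_data is elided here, so any name beyond those visible above is an assumption):

    UNK_IDX = src_dict['<unk>']
    # map each source token to its id, defaulting to the <unk> id
    src_ids = [src_dict.get(w, UNK_IDX) for w in line_split[0].split()]
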
@@ -34,7 +34,7 @@ def load_data(data_file, src_dict, trg_dict):
 def data_reader(data_file, src_dict, trg_dict, pos_size, padding_num):
     def reader():
-        UNK_IDX = src_dict['UNK']
+        UNK_IDX = src_dict['<unk>']
         word_padding = trg_dict.__len__()
         pos_padding = pos_size
@@ -64,4 +64,4 @@ def data_reader(data_file, src_dict, trg_dict, pos_size, padding_num):
             trg_next_word = trg_next_word + [trg_dict['<e>']] * padding_num
             yield src_word, src_word_pos, trg_word, trg_word_pos, trg_next_word
-    return reader
\ No newline at end of file
+    return reader
@@ -249,4 +249,4 @@ def main():
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()