"""
Contains model configuration for external-memory-enhanced seq2seq.

The "external memory" refers to two types of memories.

- Unbounded memory: i.e. vanilla attention mechanism in Seq2Seq.
- Bounded memory: i.e. external memory in NTM.

Both types of external memories are exploited to enhance the vanilla
Seq2Seq neural machine translation.

The implementation primarily follows the paper
`Memory-enhanced Decoder for Neural Machine Translation
<https://arxiv.org/abs/1606.02003>`_,
with some minor differences (will be listed in README.md).

For details about "external memory", please also refer to
`Neural Turing Machines <https://arxiv.org/abs/1410.5401>`_.
"""
import paddle.v2 as paddle
from external_memory import ExternalMemory


def bidirectional_gru_encoder(input, size, word_vec_dim):
    """
    Bidirectional GRU encoder.

    :param input: Layer fed to the embedding lookup (presumably a word-id
                  sequence data layer of the source language).
    :param size: Hidden size of each of the two (forward / backward) GRUs.
    :param word_vec_dim: Size of the token embedding.
    :return: Tuple of

             1. the concatenation of the forward and backward GRU output
                sequences (token-level encoding, used as attention memory);
             2. the first step of the backward GRU output sequence
                (sequence-level encoding).
    """
    # token embedding
    embeddings = paddle.layer.embedding(input=input, size=word_vec_dim)
    # token-level forward and backward encoding for attentions
    forward = paddle.networks.simple_gru(
        input=embeddings, size=size, reverse=False)
    backward = paddle.networks.simple_gru(
        input=embeddings, size=size, reverse=True)
    forward_backward = paddle.layer.concat(input=[forward, backward])
    # sequence-level encoding: the backward GRU has consumed the whole
    # sequence once it reaches the first token.
    backward_first = paddle.layer.first_seq(input=backward)
    return forward_backward, backward_first


def memory_enhanced_decoder(input,
                            target,
                            initial_state,
                            source_context,
                            size,
                            word_vec_dim,
                            dict_size,
                            is_generating,
                            beam_size,
                            max_length=100,
                            bos_id=0,
                            eos_id=1):
    """
    GRU sequence decoder enhanced with external memory.

    The "external memory" refers to two types of memories.

    - Unbounded memory: i.e. attention mechanism in Seq2Seq.
    - Bounded memory: i.e. external memory in NTM.

    Both types of external memories can be implemented with
    ExternalMemory class, and are both exploited in this enhanced RNN decoder.

    The vanilla RNN/LSTM/GRU also has a narrow memory mechanism, namely the
    hidden state vector (or cell state in LSTM) carrying information through
    a span of sequence time, which is a successful design enriching the model
    with the capability to "remember" things in the long run. However, such a
    vector state is somewhat limited to a very narrow memory bandwidth.
    External memory introduced here could easily increase the memory capacity
    with linear complexity cost (rather than quadratic for vector state).

    This enhanced decoder expands its "memory passage" through two
    ExternalMemory objects:

    - Bounded memory for handling long-term information exchange within
      decoder itself. A direct expansion of traditional "vector" state.
    - Unbounded memory for handling source language's token-wise information.
      Exactly the attention mechanism over Seq2Seq.

    Notice that we take the attention mechanism as a particular form of
    external memory, with read-only memory bank initialized with encoder
    states, and a read head with content-based addressing (attention). From
    this view point, we arrive at a better understanding of attention
    mechanism itself and other external memory, and a concise and unified
    implementation for them.

    For more details about external memory, please refer to
    `Neural Turing Machines <https://arxiv.org/abs/1410.5401>`_.

    For more details about this memory-enhanced decoder, please
    refer to `Memory-enhanced Decoder for Neural Machine Translation
    <https://arxiv.org/abs/1606.02003>`_. This implementation is highly
    correlated to this paper, but with minor differences (e.g. put "write"
    before "read" to bypass a potential bug in V2 APIs; see the corresponding
    issue in the Paddle issue tracker).

    Besides its arguments, this function declares a data layer named
    ``bounded_memory_perturbation`` of type ``dense_vector_sequence(size)``
    which has to be fed by the caller; the initial bounded memory is the
    (expanded) slot initialization plus this perturbation sequence.

    :param input: Target-side input layer fed to the decoder embedding.
                  Only used when ``is_generating`` is False.
    :param target: Label layer for the classification cost. Only used when
                   ``is_generating`` is False.
    :param initial_state: Boot layer for the decoder GRU hidden state.
    :param source_context: Token-level source encoding. It initializes the
                           unbounded memory and, average-pooled, the bounded
                           memory slots. Expected to have width ``2 * size``
                           (as produced by ``bidirectional_gru_encoder``
                           called with the same ``size``).
    :param size: Hidden size of the decoder GRU and slot size of the bounded
                 memory.
    :param word_vec_dim: Size of the target token embedding.
    :param dict_size: Target dictionary size (width of the softmax output).
    :param is_generating: If False, build the training network; otherwise
                          build the beam-search generation network.
    :param beam_size: Beam width. Only used when ``is_generating`` is True.
    :param max_length: Maximum generated length handed to beam search.
                       Only used when ``is_generating`` is True.
    :param bos_id: Dictionary id of the begin-of-sequence token handed to
                   beam search. Only used when ``is_generating`` is True.
    :param eos_id: Dictionary id of the end-of-sequence token handed to
                   beam search. Only used when ``is_generating`` is True.
    :return: The classification cost layer in training mode, or the
             beam-search layer in generating mode.
    """
    # prepare initial bounded and unbounded memory
    bounded_memory_slot_init = paddle.layer.fc(
        input=paddle.layer.pooling(
            input=source_context, pooling_type=paddle.pooling.Avg()),
        size=size,
        act=paddle.activation.Sigmoid())
    bounded_memory_perturbation = paddle.layer.data(
        name='bounded_memory_perturbation',
        type=paddle.data_type.dense_vector_sequence(size))
    # Every slot starts from the same pooled-context vector; the externally
    # fed perturbation sequence is added on top of the expanded copies.
    bounded_memory_init = paddle.layer.addto(
        input=[
            paddle.layer.expand(
                input=bounded_memory_slot_init,
                expand_as=bounded_memory_perturbation),
            bounded_memory_perturbation
        ],
        act=paddle.activation.Linear())
    unbounded_memory_init = source_context

    # prepare step function for recurrent group
    def recurrent_decoder_step(cur_embedding):
        """Build one decoder time step given the current token embedding."""
        # create hidden state, bounded and unbounded memory.
        # The memory name matches the gru_step layer below, so the state is
        # the previous step's GRU output.
        state = paddle.layer.memory(
            name="gru_decoder", size=size, boot_layer=initial_state)
        bounded_memory = ExternalMemory(
            name="bounded_memory",
            mem_slot_size=size,
            boot_layer=bounded_memory_init,
            readonly=False,
            enable_interpolation=True)
        unbounded_memory = ExternalMemory(
            name="unbounded_memory",
            mem_slot_size=size * 2,
            boot_layer=unbounded_memory_init,
            readonly=True,
            enable_interpolation=False)
        # write bounded memory
        # NOTE: "write" is deliberately placed before "read" (see docstring).
        bounded_memory.write(state)
        # read bounded memory
        bounded_memory_read = bounded_memory.read(state)
        # prepare key for unbounded memory
        key_for_unbounded_memory = paddle.layer.fc(
            input=[bounded_memory_read, cur_embedding],
            size=size,
            act=paddle.activation.Tanh(),
            bias_attr=False)
        # read unbounded memory (i.e. attention mechanism)
        context = unbounded_memory.read(key_for_unbounded_memory)
        # gated recurrent unit
        # The projection is 3 * size wide because gru_step takes its gate
        # and candidate inputs pre-projected and concatenated.
        gru_inputs = paddle.layer.fc(
            input=[context, cur_embedding, bounded_memory_read],
            size=size * 3,
            act=paddle.activation.Linear(),
            bias_attr=False)
        gru_output = paddle.layer.gru_step(
            name="gru_decoder", input=gru_inputs, output_mem=state, size=size)
        # step output
        return paddle.layer.fc(
            input=[gru_output, context, cur_embedding],
            size=dict_size,
            act=paddle.activation.Softmax(),
            bias_attr=True)

    if not is_generating:
        # Training: teacher-forced decoding over the given target input.
        target_embeddings = paddle.layer.embedding(
            input=input,
            size=word_vec_dim,
            param_attr=paddle.attr.ParamAttr(name="_decoder_word_embedding"))
        decoder_result = paddle.layer.recurrent_group(
            name="decoder_group",
            step=recurrent_decoder_step,
            input=[target_embeddings])
        cost = paddle.layer.classification_cost(
            input=decoder_result, label=target)
        return cost

    # Generation: feed back generated tokens, looking them up in the same
    # "_decoder_word_embedding" parameter that training uses.
    target_embeddings = paddle.layer.GeneratedInputV2(
        size=dict_size,
        embedding_name="_decoder_word_embedding",
        embedding_size=word_vec_dim)
    beam_gen = paddle.layer.beam_search(
        name="decoder_group",
        step=recurrent_decoder_step,
        input=[target_embeddings],
        bos_id=bos_id,
        eos_id=eos_id,
        beam_size=beam_size,
        max_length=max_length)
    return beam_gen


def memory_enhanced_seq2seq(encoder_input,
                            decoder_input,
                            decoder_target,
                            hidden_size,
                            word_vec_dim,
                            dict_size,
                            is_generating,
                            beam_size,
                            max_length=100,
                            bos_id=0,
                            eos_id=1):
    """
    Seq2Seq Model enhanced with external memory.

    The "external memory" refers to two types of memories.

    - Unbounded memory: i.e. attention mechanism in Seq2Seq.
    - Bounded memory: i.e. external memory in NTM.

    Both types of external memories can be implemented with
    ExternalMemory class, and are both exploited in this Seq2Seq model.

    Please refer to the function comments of memory_enhanced_decoder(...).

    For more details about external memory, please refer to
    `Neural Turing Machines <https://arxiv.org/abs/1410.5401>`_.

    For more details about this memory-enhanced Seq2Seq, please
    refer to `Memory-enhanced Decoder for Neural Machine Translation
    <https://arxiv.org/abs/1606.02003>`_.

    :param encoder_input: Source-side input layer for the encoder.
    :param decoder_input: Target-side input layer for the decoder (used in
                          training mode only).
    :param decoder_target: Label layer for the decoder cost (used in
                           training mode only).
    :param hidden_size: Hidden size shared by the encoder GRUs and the
                        decoder GRU.
    :param word_vec_dim: Size of the token embeddings.
    :param dict_size: Target dictionary size.
    :param is_generating: If False, build the training network; otherwise
                          build the beam-search generation network.
    :param beam_size: Beam width (used in generating mode only).
    :param max_length: Maximum generated length (generating mode only).
    :param bos_id: Begin-of-sequence token id (generating mode only).
    :param eos_id: End-of-sequence token id (generating mode only).
    :return: Whatever memory_enhanced_decoder(...) returns: the cost layer
             in training mode, or the beam-search layer in generating mode.
    """
    # encoder
    context_encodings, sequence_encoding = bidirectional_gru_encoder(
        input=encoder_input, size=hidden_size, word_vec_dim=word_vec_dim)
    # decoder
    return memory_enhanced_decoder(
        input=decoder_input,
        target=decoder_target,
        initial_state=sequence_encoding,
        source_context=context_encodings,
        size=hidden_size,
        word_vec_dim=word_vec_dim,
        dict_size=dict_size,
        is_generating=is_generating,
        beam_size=beam_size,
        max_length=max_length,
        bos_id=bos_id,
        eos_id=eos_id)