Commit 63416ffa authored by Xinghai Sun

Add model configuration for machine translation with external memory.

Parent 367e1231
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.v2 as paddle
import sys
import gzip
dict_size = 30000
word_vec_dim = 512
hidden_size = 1024
batch_size = 5
memory_slot_num = 8
beam_size = 40
infer_data_num = 3
class ExternalMemory(object):
"""
External neural memory class, with differentiable write/read heads.
A simplified Neural Turing Machine (NTM) with only content-based
addressing (including content addressing and interpolation, but excluding
convolutional shift and sharpening). It can serve as an external memory
    bank, with differentiable write/read head controllers responsible for
    storing and reading information dynamically as the model needs. Here,
    simple feedforward neural networks are used as the write/read head
    controllers. For more technical details, please refer to the
`NTM paper <https://arxiv.org/abs/1410.5401>`_.
:param name: Memory name.
:type name: basestring
:param mem_slot_size: Size of memory slot/vector.
:type mem_slot_size: int
:param boot_layer: Boot layer for initializing memory. Sequence layer
with sequence length indicating the number of memory
slots, and size as mem_slot_size.
:type boot_layer: LayerOutput
    :param readonly: If True, the memory is read-only and the write
                     function cannot be called. Default is False.
:type readonly: bool
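
    A minimal usage sketch (hypothetical layer names; an ExternalMemory must
    be created inside a recurrent_group step function, as in the decoder
    below, with ``boot`` being any sequence layer whose sequence length gives
    the slot number and whose size equals ``mem_slot_size``)::

        memory = ExternalMemory(
            name="memory", mem_slot_size=128, boot_layer=boot)
        memory.write(write_key)          # write_key: LayerOutput, any size
        content = memory.read(read_key)  # LayerOutput of size mem_slot_size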
"""
def __init__(self, name, mem_slot_size, boot_layer, readonly=False):
self.name = name
self.mem_slot_size = mem_slot_size
self.readonly = readonly
self.external_memory = paddle.layer.memory(
name=self.name,
size=self.mem_slot_size,
is_seq=True,
boot_layer=boot_layer)
# set memory to constant when readonly=True
if self.readonly:
self.updated_external_memory = paddle.layer.mixed(
name=self.name,
input=[
paddle.layer.identity_projection(input=self.external_memory)
],
size=self.mem_slot_size)
def __content_addressing__(self, key_vector):
"""
Get head's addressing weight via content-based addressing.
"""
# content-based addressing: a=tanh(W*M + U*key)
key_projection = paddle.layer.fc(
input=key_vector,
size=self.mem_slot_size,
act=paddle.activation.Linear(),
bias_attr=False)
key_proj_expanded = paddle.layer.expand(
input=key_projection, expand_as=self.external_memory)
memory_projection = paddle.layer.fc(
input=self.external_memory,
size=self.mem_slot_size,
act=paddle.activation.Linear(),
bias_attr=False)
merged = paddle.layer.addto(
input=[key_proj_expanded, memory_projection],
act=paddle.activation.Tanh())
# softmax addressing weight: w=softmax(v^T a)
addressing_weight = paddle.layer.fc(
input=merged,
size=1,
act=paddle.activation.SequenceSoftmax(),
bias_attr=False)
return addressing_weight
def __interpolation__(self, key_vector, addressing_weight):
"""
Interpolate between previous and current addressing weights.
"""
# prepare interpolation scalar gate: g=sigmoid(W*key)
gate = paddle.layer.fc(
input=key_vector,
size=1,
act=paddle.activation.Sigmoid(),
bias_attr=False)
        # interpolation: w_t = tanh(g * w_t + (1 - g) * w_{t-1})
last_addressing_weight = paddle.layer.memory(
name=self.name + "_addressing_weight", size=1, is_seq=True)
gated_addressing_weight = paddle.layer.addto(
name=self.name + "_addressing_weight",
input=[
last_addressing_weight,
paddle.layer.scaling(weight=gate, input=addressing_weight),
paddle.layer.mixed(
input=paddle.layer.dotmul_operator(
a=gate, b=last_addressing_weight, scale=-1.0),
size=1)
],
act=paddle.activation.Tanh())
return gated_addressing_weight
    def __get_addressing_weight__(self, key_vector):
        """
        Get the final addressing weight for the read/write heads. Only
        content-based addressing is applied; the interpolation step
        (__interpolation__) is implemented above but not wired in here.
        """
        # content-based addressing only
        return self.__content_addressing__(key_vector)
def write(self, write_key):
"""
Write head for external memory.
:param write_key: Key vector for write head to generate writing
content and addressing signals.
:type write_key: LayerOutput
"""
# check readonly
if self.readonly:
raise ValueError("ExternalMemory with readonly=True cannot write.")
# get addressing weight for write head
write_weight = self.__get_addressing_weight__(write_key)
# prepare add_vector and erase_vector
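        # NTM-style write, as composed below:
        #   M_t = M_{t-1} .* (1 - w_t * e_t) + w_t * a_t
        # i.e. M_t = M_{t-1} - M_{t-1} .* (w_t * e_t) + w_t * a_t,
        # with erase vector e_t, add vector a_t and addressing weight w_t.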
erase_vector = paddle.layer.fc(
input=write_key,
size=self.mem_slot_size,
act=paddle.activation.Sigmoid(),
bias_attr=False)
add_vector = paddle.layer.fc(
input=write_key,
size=self.mem_slot_size,
act=paddle.activation.Sigmoid(),
bias_attr=False)
erase_vector_expand = paddle.layer.expand(
input=erase_vector, expand_as=self.external_memory)
add_vector_expand = paddle.layer.expand(
input=add_vector, expand_as=self.external_memory)
# prepare scaled add part and erase part
scaled_erase_vector_expand = paddle.layer.scaling(
weight=write_weight, input=erase_vector_expand)
erase_memory_part = paddle.layer.mixed(
input=paddle.layer.dotmul_operator(
a=self.external_memory,
b=scaled_erase_vector_expand,
scale=-1.0))
add_memory_part = paddle.layer.scaling(
weight=write_weight, input=add_vector_expand)
# update external memory
self.updated_external_memory = paddle.layer.addto(
input=[self.external_memory, add_memory_part, erase_memory_part],
name=self.name)
def read(self, read_key):
"""
Read head for external memory.
        :param read_key: Key vector for read head to generate addressing
                         signals.
        :type read_key: LayerOutput
:return: Content read from external memory.
:rtype: LayerOutput
"""
        # get addressing weight for read head
read_weight = self.__get_addressing_weight__(read_key)
# read content from external memory
scaled = paddle.layer.scaling(
weight=read_weight, input=self.updated_external_memory)
return paddle.layer.pooling(
input=scaled, pooling_type=paddle.pooling.Sum())
def bidirectional_gru_encoder(input, size, word_vec_dim):
"""
Bidirectional GRU encoder.
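    Returns both the per-token encodings (forward and backward passes
    concatenated, size 2 * size) used for the attention-like unbounded
    memory, and the first step of the backward pass (size `size`) as a
    whole-sequence encoding.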
"""
# token embedding
embeddings = paddle.layer.embedding(
input=input,
size=word_vec_dim,
param_attr=paddle.attr.ParamAttr(name='_encoder_word_embedding'))
    # token-level forward and backward encoding for attentions
forward = paddle.networks.simple_gru(
input=embeddings, size=size, reverse=False)
backward = paddle.networks.simple_gru(
input=embeddings, size=size, reverse=True)
merged = paddle.layer.concat(input=[forward, backward])
# sequence-level encoding
backward_first = paddle.layer.first_seq(input=backward)
return merged, backward_first
def memory_enhanced_decoder(input, target, initial_state, source_context, size,
word_vec_dim, dict_size, is_generating, beam_size):
"""
    Memory-enhanced GRU decoder.
    The "external memory" refers to two types of memory:
    - Unbounded memory: i.e. the vanilla attention mechanism.
    - Bounded memory: i.e. the external memory in an NTM.
    Both types of external memory can be implemented with the ExternalMemory
    class, and both are included in this enhanced seq2seq model.
    Here, the bounded memory takes the place of the "state" vector in RNNs.
    The state vector in RNNs is a very successful design, enriching the model
    with the capability to "remember" things in the long run (across multiple
    sequence steps). However, such a vector state has a rather small memory
    bandwidth. The bounded memory introduced here can easily increase the
    memory capacity at a linear complexity cost (rather than quadratic, as
    with a vector state). Besides, the attention mechanism (with unbounded
    memory) also serves as an external memory bank encoding the source input.
    Notice that we treat the attention mechanism as a special form of external
    memory: a read-only memory bank initialized with the encoder states, plus
    a content-based addressing read head responsible for generating the
    attentional context. This viewpoint gives a better understanding of the
    attention mechanism and other types of external memory, and it also
    enables a concise and unified implementation of both.
    For more technical details about external memory, please refer to
    `Neural Turing Machines <https://arxiv.org/abs/1410.5401>`_.
    For more technical details about this memory-enhanced decoder, please
    refer to `Memory-enhanced Decoder for Neural Machine Translation
    <https://arxiv.org/abs/1606.02003>`_. This implementation closely follows
    that paper, with minor differences.
    Also, the read-write order is reversed here to sidestep potential problems
    with the PaddlePaddle V2 APIs; see this
    `issue <https://github.com/PaddlePaddle/Paddle/issues/2061>`_.
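
    A sketch of one decoder step, following the step function implemented
    below (informal notation: s is the GRU state, y the current word
    embedding, M_b/M_u the bounded/unbounded memories)::

        M_b   <- write(M_b, s_{t-1})            # update bounded memory
        c_b    = read(M_b, s_{t-1})             # read bounded memory
        k      = tanh(W [c_b, y])               # key for unbounded memory
        c_u    = read(M_u, k)                   # attention context
        s_t    = GRU(s_{t-1}, W' [c_u, y, c_b])
        p(w)   = softmax(W'' [s_t, c_u, y])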
"""
# prepare initial bounded and unbounded memory
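    # The bounded memory is bootstrapped by broadcasting a learned transform
    # of the average encoder state to all slots; the slot number comes from
    # the sequence length of the `bounded_memory_template` data layer. The
    # unbounded memory is bootstrapped with the per-token encoder states.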
bounded_memory_slot_init = paddle.layer.fc(
input=paddle.layer.pooling(
input=source_context, pooling_type=paddle.pooling.Avg()),
size=size,
act=paddle.activation.Sigmoid())
bounded_memory_init = paddle.layer.expand(
input=bounded_memory_slot_init,
expand_as=paddle.layer.data(
name='bounded_memory_template',
type=paddle.data_type.integer_value_sequence(0)))
unbounded_memory_init = source_context
    # prepare step function for recurrent group
def recurrent_decoder_step(cur_embedding):
# create hidden state, bounded and unbounded memory.
state = paddle.layer.memory(
name="gru_decoder", size=size, boot_layer=initial_state)
bounded_memory = ExternalMemory(
name="bounded_memory",
mem_slot_size=size,
boot_layer=bounded_memory_init,
readonly=False)
unbounded_memory = ExternalMemory(
name="unbounded_memory",
mem_slot_size=size * 2,
boot_layer=unbounded_memory_init,
readonly=True)
# write bounded memory
bounded_memory.write(state)
# read bounded memory
bounded_memory_read = bounded_memory.read(state)
# prepare key for unbounded memory
key_for_unbounded_memory = paddle.layer.fc(
input=[bounded_memory_read, cur_embedding],
size=size,
act=paddle.activation.Tanh(),
bias_attr=False)
# read unbounded memory (i.e. attention mechanism)
context = unbounded_memory.read(key_for_unbounded_memory)
# gated recurrent unit
gru_inputs = paddle.layer.fc(
input=[context, cur_embedding, bounded_memory_read],
size=size * 3,
act=paddle.activation.Linear(),
bias_attr=False)
gru_output = paddle.layer.gru_step(
name="gru_decoder", input=gru_inputs, output_mem=state, size=size)
# step output
return paddle.layer.fc(
input=[gru_output, context, cur_embedding],
size=dict_size,
act=paddle.activation.Softmax(),
bias_attr=True)
if not is_generating:
target_embeddings = paddle.layer.embedding(
input=input,
size=word_vec_dim,
param_attr=paddle.attr.ParamAttr(name="_decoder_word_embedding"))
decoder_result = paddle.layer.recurrent_group(
name="decoder_group",
step=recurrent_decoder_step,
input=[target_embeddings])
cost = paddle.layer.classification_cost(
input=decoder_result, label=target)
return cost
else:
target_embeddings = paddle.layer.GeneratedInputV2(
size=dict_size,
embedding_name="_decoder_word_embedding",
embedding_size=word_vec_dim)
beam_gen = paddle.layer.beam_search(
name="decoder_group",
step=recurrent_decoder_step,
input=[target_embeddings],
bos_id=0,
eos_id=1,
beam_size=beam_size,
max_length=100)
return beam_gen
def memory_enhanced_seq2seq(encoder_input, decoder_input, decoder_target,
hidden_size, word_vec_dim, dict_size, is_generating,
beam_size):
"""
    Seq2seq model enhanced with external memory.
    The "external memory" refers to two types of memory:
    - Unbounded memory: i.e. the vanilla attention mechanism.
    - Bounded memory: i.e. the external memory in an NTM.
    Both types of external memory can be implemented with the ExternalMemory
    class, and both are included in this enhanced seq2seq model.
    Here, the bounded memory takes the place of the "state" vector in RNNs.
    The state vector in RNNs is a very successful design, enriching the model
    with the capability to "remember" things in the long run (across multiple
    sequence steps). However, such a vector state has a rather small memory
    bandwidth. The bounded memory introduced here can easily increase the
    memory capacity at a linear complexity cost (rather than quadratic, as
    with a vector state). Besides, the attention mechanism (with unbounded
    memory) also serves as an external memory bank encoding the source input.
    Notice that we treat the attention mechanism as a special form of external
    memory: a read-only memory bank initialized with the encoder states, plus
    a content-based addressing read head responsible for generating the
    attentional context. This viewpoint gives a better understanding of the
    attention mechanism and other types of external memory, and it also
    enables a concise and unified implementation of both.
    For more technical details about external memory, please refer to
    `Neural Turing Machines <https://arxiv.org/abs/1410.5401>`_.
    For more technical details about this memory-enhanced decoder, please
    refer to `Memory-enhanced Decoder for Neural Machine Translation
    <https://arxiv.org/abs/1606.02003>`_. This implementation closely follows
    that paper, with minor differences.
    Also, the read-write order is reversed here to sidestep potential problems
    with the PaddlePaddle V2 APIs; see this
    `issue <https://github.com/PaddlePaddle/Paddle/issues/2061>`_.
"""
# encoder
context_encodings, sequence_encoding = bidirectional_gru_encoder(
input=encoder_input, size=hidden_size, word_vec_dim=word_vec_dim)
# decoder
return memory_enhanced_decoder(
input=decoder_input,
target=decoder_target,
initial_state=sequence_encoding,
source_context=context_encodings,
size=hidden_size,
word_vec_dim=word_vec_dim,
dict_size=dict_size,
is_generating=is_generating,
beam_size=beam_size)
def parse_beam_result(beam_result, dictionary):
"""
    Parse a beam search result into candidate sentences.
    :param beam_result: Output of paddle.infer with fields ['prob', 'id'];
                        the ids of all beams are flattened into one stream,
                        with -1 separating consecutive candidates.
    :param dictionary: Id-to-word dictionary for the target language.
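    For example (hypothetical ids, beam_size=2): an id stream
    [0, 5, 7, 1, -1, 0, 5, 9, 1, -1] is split at the -1 separators into two
    candidates per source sample, dropping the leading <s> (id 0).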
"""
sentence_list = []
sentence = []
for word in beam_result[1]:
if word != -1:
sentence.append(word)
else:
sentence_list.append(
' '.join([dictionary.get(word) for word in sentence[1:]]))
sentence = []
beam_probs = beam_result[0]
beam_size = len(beam_probs[0])
beam_sentences = [
sentence_list[i:i + beam_size]
for i in range(0, len(sentence_list), beam_size)
]
return beam_probs, beam_sentences
def reader_append_wrapper(reader, append_tuple):
"""
    Data reader wrapper for appending extra data to an existing reader.
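    For example, appending `([0] * memory_slot_num, )` turns every
    (source, target, target_next) sample into
    (source, target, target_next, memory_template).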
"""
def new_reader():
for ins in reader():
yield ins + append_tuple
return new_reader
def train(num_passes):
"""
For training.
"""
# create network config
source_words = paddle.layer.data(
name="source_words",
type=paddle.data_type.integer_value_sequence(dict_size))
target_words = paddle.layer.data(
name="target_words",
type=paddle.data_type.integer_value_sequence(dict_size))
target_next_words = paddle.layer.data(
name='target_next_words',
type=paddle.data_type.integer_value_sequence(dict_size))
cost = memory_enhanced_seq2seq(
encoder_input=source_words,
decoder_input=target_words,
decoder_target=target_next_words,
hidden_size=hidden_size,
word_vec_dim=word_vec_dim,
dict_size=dict_size,
is_generating=False,
beam_size=beam_size)
# create parameters and optimizer
parameters = paddle.parameters.create(cost)
optimizer = paddle.optimizer.Adam(
learning_rate=5e-5,
gradient_clipping_threshold=5,
regularization=paddle.optimizer.L2Regularization(rate=8e-4))
trainer = paddle.trainer.SGD(
cost=cost, parameters=parameters, update_equation=optimizer)
# create data readers
feeding = {
"source_words": 0,
"target_words": 1,
"target_next_words": 2,
"bounded_memory_template": 3
}
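    # `feeding` maps each data layer name to its position in the reader's
    # sample tuple; the appended all-zero `bounded_memory_template` sequence
    # (position 3) only determines the number of bounded-memory slots, its
    # values are never read.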
train_append_reader = reader_append_wrapper(
reader=paddle.dataset.wmt14.train(dict_size),
append_tuple=([0] * memory_slot_num, ))
train_batch_reader = paddle.batch(
reader=paddle.reader.shuffle(reader=train_append_reader, buf_size=8192),
batch_size=batch_size)
test_append_reader = reader_append_wrapper(
reader=paddle.dataset.wmt14.test(dict_size),
append_tuple=([0] * memory_slot_num, ))
test_batch_reader = paddle.batch(
reader=paddle.reader.shuffle(reader=test_append_reader, buf_size=8192),
batch_size=batch_size)
# create event handler
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 10 == 0:
print "Pass: %d, Batch: %d, TrainCost: %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics)
with gzip.open("params.tar.gz", 'w') as f:
parameters.to_tar(f)
else:
sys.stdout.write('.')
sys.stdout.flush()
if isinstance(event, paddle.event.EndPass):
result = trainer.test(reader=test_batch_reader, feeding=feeding)
print "Pass: %d, TestCost: %f, %s" % (event.pass_id, event.cost,
result.metrics)
with gzip.open("params.tar.gz", 'w') as f:
parameters.to_tar(f)
# run train
trainer.train(
reader=train_batch_reader,
event_handler=event_handler,
num_passes=num_passes,
feeding=feeding)
def infer():
"""
    For inference.
"""
# create network config
source_words = paddle.layer.data(
name="source_words",
type=paddle.data_type.integer_value_sequence(dict_size))
    beam_gen = memory_enhanced_seq2seq(
encoder_input=source_words,
decoder_input=None,
decoder_target=None,
hidden_size=hidden_size,
word_vec_dim=word_vec_dim,
dict_size=dict_size,
is_generating=True,
beam_size=beam_size)
# load parameters
parameters = paddle.parameters.Parameters.from_tar(
gzip.open("params.tar.gz"))
# prepare infer data
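    # each inference sample keeps only the source word ids and the
    # bounded-memory template (fields 0 and 3 of the wrapped reader sample)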
infer_data = []
test_append_reader = reader_append_wrapper(
reader=paddle.dataset.wmt14.test(dict_size),
append_tuple=([0] * memory_slot_num, ))
for i, item in enumerate(test_append_reader()):
if i < infer_data_num:
infer_data.append((item[0], item[3], ))
# run inference
beam_result = paddle.infer(
output_layer=beam_gen,
parameters=parameters,
input=infer_data,
field=['prob', 'id'])
# parse beam result and print
source_dict, target_dict = paddle.dataset.wmt14.get_dict(dict_size)
beam_probs, beam_sentences = parse_beam_result(beam_result, target_dict)
for i in xrange(infer_data_num):
print "\n*******************************************************\n"
print "src:", ' '.join(
[source_dict.get(word) for word in infer_data[i][0]]), "\n"
for j in xrange(beam_size):
print "prob = %f : %s" % (beam_probs[i][j], beam_sentences[i][j])
def main():
paddle.init(use_gpu=False, trainer_count=1)
train(num_passes=1)
infer()
if __name__ == '__main__':
main()