提交 717ccf5e 编写于 作者: C Cao Ying 提交者: GitHub

Merge pull request #36 from xinghai-sun/mt_with_external_memory

Add model configuration for machine translation with external memory.
此差异已折叠。
"""
Contains data utilities.
"""
def reader_append_wrapper(reader, append_tuple):
"""
Data reader wrapper for appending extra data to exisiting reader.
"""
def new_reader():
for ins in reader():
yield ins + append_tuple
return new_reader
"""
External neural memory class.
"""
import paddle.v2 as paddle
class ExternalMemory(object):
"""External neural memory class.
A simplified Neural Turing Machines (NTM) with only content-based
addressing (including content addressing and interpolation, but excluding
convolutional shift and sharpening). It serves as an external differential
memory bank, with differential write/read head controllers to store
and read information dynamically. Simple feedforward networks are
used as the write/read head controllers.
The ExternalMemory class could be utilized by many neural network structures
to easily expand their memory bandwidth and accomplish a long-term memory
handling. Besides, some existing mechanism can be realized directly with
the ExternalMemory class, e.g. the attention mechanism in Seq2Seq (i.e. an
unbounded external memory).
Besides, the ExternalMemory class must be used together with
paddle.layer.recurrent_group (within its step function). It can never be
used in a standalone manner.
For more details, please refer to
`Neural Turing Machines <https://arxiv.org/abs/1410.5401>`_.
:param name: Memory name.
:type name: basestring
:param mem_slot_size: Size of memory slot/vector.
:type mem_slot_size: int
:param boot_layer: Boot layer for initializing the external memory. The
sequence layer has sequence length indicating the number
of memory slots, and size as memory slot size.
:type boot_layer: LayerOutput
:param readonly: If true, the memory is read-only, and write function cannot
be called. Default is false.
:type readonly: bool
:param enable_interpolation: If set true, the read/write addressing weights
will be interpolated with the weights in the
last step, with the affine coefficients being
a learnable gate function.
:type enable_interpolation: bool
"""
def __init__(self,
name,
mem_slot_size,
boot_layer,
readonly=False,
enable_interpolation=True):
self.name = name
self.mem_slot_size = mem_slot_size
self.readonly = readonly
self.enable_interpolation = enable_interpolation
self.external_memory = paddle.layer.memory(
name=self.name, size=self.mem_slot_size, boot_layer=boot_layer)
# prepare a constant (zero) intializer for addressing weights
self.zero_addressing_init = paddle.layer.slope_intercept(
input=paddle.layer.fc(input=boot_layer, size=1),
slope=0.0,
intercept=0.0)
# set memory to constant when readonly=True
if self.readonly:
self.updated_external_memory = paddle.layer.mixed(
name=self.name,
input=[
paddle.layer.identity_projection(input=self.external_memory)
],
size=self.mem_slot_size)
def _content_addressing(self, key_vector):
"""Get write/read head's addressing weights via content-based addressing.
"""
# content-based addressing: a=tanh(W*M + U*key)
key_projection = paddle.layer.fc(
input=key_vector,
size=self.mem_slot_size,
act=paddle.activation.Linear(),
bias_attr=False)
key_proj_expanded = paddle.layer.expand(
input=key_projection, expand_as=self.external_memory)
memory_projection = paddle.layer.fc(
input=self.external_memory,
size=self.mem_slot_size,
act=paddle.activation.Linear(),
bias_attr=False)
merged_projection = paddle.layer.addto(
input=[key_proj_expanded, memory_projection],
act=paddle.activation.Tanh())
# softmax addressing weight: w=softmax(v^T a)
addressing_weight = paddle.layer.fc(
input=merged_projection,
size=1,
act=paddle.activation.SequenceSoftmax(),
bias_attr=False)
return addressing_weight
def _interpolation(self, head_name, key_vector, addressing_weight):
"""Interpolate between previous and current addressing weights.
"""
# prepare interpolation scalar gate: g=sigmoid(W*key)
gate = paddle.layer.fc(
input=key_vector,
size=1,
act=paddle.activation.Sigmoid(),
bias_attr=False)
# interpolation: w_t = g*w_t+(1-g)*w_{t-1}
last_addressing_weight = paddle.layer.memory(
name=self.name + "_addressing_weight_" + head_name,
size=1,
boot_layer=self.zero_addressing_init)
interpolated_weight = paddle.layer.interpolation(
name=self.name + "_addressing_weight_" + head_name,
input=[addressing_weight, addressing_weight],
weight=paddle.layer.expand(input=gate, expand_as=addressing_weight))
return interpolated_weight
def _get_addressing_weight(self, head_name, key_vector):
"""Get final addressing weights for read/write heads, including content
addressing and interpolation.
"""
# current content-based addressing
addressing_weight = self._content_addressing(key_vector)
# interpolation with previous addresing weight
if self.enable_interpolation:
return self._interpolation(head_name, key_vector, addressing_weight)
else:
return addressing_weight
def write(self, write_key):
"""Write onto the external memory.
It cannot be called if "readonly" set True.
:param write_key: Key vector for write heads to generate writing
content and addressing signals.
:type write_key: LayerOutput
"""
# check readonly
if self.readonly:
raise ValueError("ExternalMemory with readonly=True cannot write.")
# get addressing weight for write head
write_weight = self._get_addressing_weight("write_head", write_key)
# prepare add_vector and erase_vector
erase_vector = paddle.layer.fc(
input=write_key,
size=self.mem_slot_size,
act=paddle.activation.Sigmoid(),
bias_attr=False)
add_vector = paddle.layer.fc(
input=write_key,
size=self.mem_slot_size,
act=paddle.activation.Sigmoid(),
bias_attr=False)
erase_vector_expand = paddle.layer.expand(
input=erase_vector, expand_as=self.external_memory)
add_vector_expand = paddle.layer.expand(
input=add_vector, expand_as=self.external_memory)
# prepare scaled add part and erase part
scaled_erase_vector_expand = paddle.layer.scaling(
weight=write_weight, input=erase_vector_expand)
erase_memory_part = paddle.layer.mixed(
input=paddle.layer.dotmul_operator(
a=self.external_memory,
b=scaled_erase_vector_expand,
scale=-1.0))
add_memory_part = paddle.layer.scaling(
weight=write_weight, input=add_vector_expand)
# update external memory
self.updated_external_memory = paddle.layer.addto(
input=[self.external_memory, add_memory_part, erase_memory_part],
name=self.name)
def read(self, read_key):
"""Read from the external memory.
:param write_key: Key vector for read head to generate addressing
signals.
:type write_key: LayerOutput
:return: Content (vector) read from external memory.
:rtype: LayerOutput
"""
# get addressing weight for write head
read_weight = self._get_addressing_weight("read_head", read_key)
# read content from external memory
scaled = paddle.layer.scaling(
weight=read_weight, input=self.updated_external_memory)
return paddle.layer.pooling(
input=scaled, pooling_type=paddle.pooling.Sum())
"""
Contains infering script for machine translation with external memory.
"""
import distutils.util
import argparse
import gzip
import paddle.v2 as paddle
from external_memory import ExternalMemory
from model import memory_enhanced_seq2seq
from data_utils import reader_append_wrapper
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--dict_size",
default=30000,
type=int,
help="Vocabulary size. (default: %(default)s)")
parser.add_argument(
"--word_vec_dim",
default=512,
type=int,
help="Word embedding size. (default: %(default)s)")
parser.add_argument(
"--hidden_size",
default=1024,
type=int,
help="Hidden cell number in RNN. (default: %(default)s)")
parser.add_argument(
"--memory_slot_num",
default=8,
type=int,
help="External memory slot number. (default: %(default)s)")
parser.add_argument(
"--beam_size",
default=3,
type=int,
help="Beam search width. (default: %(default)s)")
parser.add_argument(
"--use_gpu",
default=False,
type=distutils.util.strtobool,
help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
"--trainer_count",
default=1,
type=int,
help="Trainer number. (default: %(default)s)")
parser.add_argument(
"--batch_size",
default=5,
type=int,
help="Batch size. (default: %(default)s)")
parser.add_argument(
"--infer_data_num",
default=3,
type=int,
help="Instance num to infer. (default: %(default)s)")
parser.add_argument(
"--model_filepath",
default="checkpoints/params.latest.tar.gz",
type=str,
help="Model filepath. (default: %(default)s)")
parser.add_argument(
"--memory_perturb_stddev",
default=0.1,
type=float,
help="Memory perturb stddev for memory initialization."
"(default: %(default)s)")
args = parser.parse_args()
def parse_beam_search_result(beam_result, dictionary):
"""
Beam search result parser.
"""
sentence_list = []
sentence = []
for word in beam_result[1]:
if word != -1:
sentence.append(word)
else:
sentence_list.append(
' '.join([dictionary.get(word) for word in sentence[1:]]))
sentence = []
beam_probs = beam_result[0]
beam_size = len(beam_probs[0])
beam_sentences = [
sentence_list[i:i + beam_size]
for i in range(0, len(sentence_list), beam_size)
]
return beam_probs, beam_sentences
def infer():
"""
For inferencing.
"""
# create network config
source_words = paddle.layer.data(
name="source_words",
type=paddle.data_type.integer_value_sequence(args.dict_size))
beam_gen = memory_enhanced_seq2seq(
encoder_input=source_words,
decoder_input=None,
decoder_target=None,
hidden_size=args.hidden_size,
word_vec_dim=args.word_vec_dim,
dict_size=args.dict_size,
is_generating=True,
beam_size=args.beam_size)
# load parameters
parameters = paddle.parameters.Parameters.from_tar(
gzip.open(args.model_filepath))
# prepare infer data
infer_data = []
random.seed(0) # for keeping consitancy for multiple runs
bounded_memory_perturbation = [[
random.gauss(0, memory_perturb_stddev) for i in xrange(args.hidden_size)
] for j in xrange(args.memory_slot_num)]
test_append_reader = reader_append_wrapper(
reader=paddle.dataset.wmt14.test(dict_size),
append_tuple=(bounded_memory_perturbation, ))
for i, item in enumerate(test_append_reader()):
if i < args.infer_data_num:
infer_data.append((item[0], item[3], ))
# run inference
beam_result = paddle.infer(
output_layer=beam_gen,
parameters=parameters,
input=infer_data,
field=['prob', 'id'])
# parse beam result and print
source_dict, target_dict = paddle.dataset.wmt14.get_dict(dict_size)
beam_probs, beam_sentences = parse_beam_search_result(beam_result,
target_dict)
for i in xrange(args.infer_data_num):
print "\n***************************************************\n"
print "src:", ' '.join(
[source_dict.get(word) for word in infer_data[i][0]]), "\n"
for j in xrange(args.beam_size):
print "prob = %f : %s" % (beam_probs[i][j], beam_sentences[i][j])
def main():
paddle.init(use_gpu=False, trainer_count=1)
infer()
if __name__ == '__main__':
main()
"""
Contains model configuration for external-memory-enhanced seq2seq.
The "external memory" refers to two types of memories.
- Unbounded memory: i.e. vanilla attention mechanism in Seq2Seq.
- Bounded memory: i.e. external memory in NTM.
Both types of external memories are exploited to enhance the vanilla
Seq2Seq neural machine translation.
The implementation primarily follows the paper
`Memory-enhanced Decoder for Neural Machine Translation
<https://arxiv.org/abs/1606.02003>`_,
with some minor differences (will be listed in README.md).
For details about "external memory", please also refer to
`Neural Turing Machines <https://arxiv.org/abs/1410.5401>`_.
"""
import paddle.v2 as paddle
from external_memory import ExternalMemory
def bidirectional_gru_encoder(input, size, word_vec_dim):
"""Bidirectional GRU encoder.
:params size: Hidden cell number in decoder rnn.
:type size: int
:params word_vec_dim: Word embedding size.
:type word_vec_dim: int
:return: Tuple of 1. concatenated forward and backward hidden sequence.
2. last state of backward rnn.
:rtype: tuple of LayerOutput
"""
# token embedding
embeddings = paddle.layer.embedding(input=input, size=word_vec_dim)
# token-level forward and backard encoding for attentions
forward = paddle.networks.simple_gru(
input=embeddings, size=size, reverse=False)
backward = paddle.networks.simple_gru(
input=embeddings, size=size, reverse=True)
forward_backward = paddle.layer.concat(input=[forward, backward])
# sequence-level encoding
backward_first = paddle.layer.first_seq(input=backward)
return forward_backward, backward_first
def memory_enhanced_decoder(input, target, initial_state, source_context, size,
word_vec_dim, dict_size, is_generating, beam_size):
"""GRU sequence decoder enhanced with external memory.
The "external memory" refers to two types of memories.
- Unbounded memory: i.e. attention mechanism in Seq2Seq.
- Bounded memory: i.e. external memory in NTM.
Both types of external memories can be implemented with
ExternalMemory class, and are both exploited in this enhanced RNN decoder.
The vanilla RNN/LSTM/GRU also has a narrow memory mechanism, namely the
hidden state vector (or cell state in LSTM) carrying information through
a span of sequence time, which is a successful design enriching the model
with the capability to "remember" things in the long run. However, such a
vector state is somewhat limited to a very narrow memory bandwidth. External
memory introduced here could easily increase the memory capacity with linear
complexity cost (rather than quadratic for vector state).
This enhanced decoder expands its "memory passage" through two
ExternalMemory objects:
- Bounded memory for handling long-term information exchange within decoder
itself. A direct expansion of traditional "vector" state.
- Unbounded memory for handling source language's token-wise information.
Exactly the attention mechanism over Seq2Seq.
Notice that we take the attention mechanism as a particular form of external
memory, with read-only memory bank initialized with encoder states, and a
read head with content-based addressing (attention). From this view point,
we arrive at a better understanding of attention mechanism itself and other
external memory, and a concise and unified implementation for them.
For more details about external memory, please refer to
`Neural Turing Machines <https://arxiv.org/abs/1410.5401>`_.
For more details about this memory-enhanced decoder, please
refer to `Memory-enhanced Decoder for Neural Machine Translation
<https://arxiv.org/abs/1606.02003>`_. This implementation is highly
correlated to this paper, but with minor differences (e.g. put "write"
before "read" to bypass a potential bug in V2 APIs. See
(`issue <https://github.com/PaddlePaddle/Paddle/issues/2061>`_).
:params input: Decoder input.
:type input: LayerOutput
:params target: Decoder target.
:type target: LayerOutput
:params initial_state: Initial hidden state.
:type initial_state: LayerOutput
:params source_context: Group of context hidden states for each token in the
source sentence, for attention mechanisim.
:type source_context: LayerOutput
:params size: Hidden cell number in decoder rnn.
:type size: int
:params word_vec_dim: Word embedding size.
:type word_vec_dim: int
:param dict_size: Vocabulary size.
:type dict_size: int
:params is_generating: Whether for beam search inferencing (True) or
for training (False).
:type is_generating: bool
:params beam_size: Beam search width.
:type beam_size: int
:return: Cost layer if is_generating=False; Beam search layer if
is_generating = True.
:rtype: LayerOutput
"""
# prepare initial bounded and unbounded memory
bounded_memory_slot_init = paddle.layer.fc(
input=paddle.layer.pooling(
input=source_context, pooling_type=paddle.pooling.Avg()),
size=size,
act=paddle.activation.Sigmoid())
bounded_memory_perturbation = paddle.layer.data(
name='bounded_memory_perturbation',
type=paddle.data_type.dense_vector_sequence(size))
bounded_memory_init = paddle.layer.addto(
input=[
paddle.layer.expand(
input=bounded_memory_slot_init,
expand_as=bounded_memory_perturbation),
bounded_memory_perturbation
],
act=paddle.activation.Linear())
unbounded_memory_init = source_context
# prepare step function for reccurent group
def recurrent_decoder_step(cur_embedding):
# create hidden state, bounded and unbounded memory.
state = paddle.layer.memory(
name="gru_decoder", size=size, boot_layer=initial_state)
bounded_memory = ExternalMemory(
name="bounded_memory",
mem_slot_size=size,
boot_layer=bounded_memory_init,
readonly=False,
enable_interpolation=True)
unbounded_memory = ExternalMemory(
name="unbounded_memory",
mem_slot_size=size * 2,
boot_layer=unbounded_memory_init,
readonly=True,
enable_interpolation=False)
# write bounded memory
bounded_memory.write(state)
# read bounded memory
bounded_memory_read = bounded_memory.read(state)
# prepare key for unbounded memory
key_for_unbounded_memory = paddle.layer.fc(
input=[bounded_memory_read, cur_embedding],
size=size,
act=paddle.activation.Tanh(),
bias_attr=False)
# read unbounded memory (i.e. attention mechanism)
context = unbounded_memory.read(key_for_unbounded_memory)
# gated recurrent unit
gru_inputs = paddle.layer.fc(
input=[context, cur_embedding, bounded_memory_read],
size=size * 3,
act=paddle.activation.Linear(),
bias_attr=False)
gru_output = paddle.layer.gru_step(
name="gru_decoder", input=gru_inputs, output_mem=state, size=size)
# step output
return paddle.layer.fc(
input=[gru_output, context, cur_embedding],
size=dict_size,
act=paddle.activation.Softmax(),
bias_attr=True)
if not is_generating:
target_embeddings = paddle.layer.embedding(
input=input,
size=word_vec_dim,
param_attr=paddle.attr.ParamAttr(name="_decoder_word_embedding"))
decoder_result = paddle.layer.recurrent_group(
name="decoder_group",
step=recurrent_decoder_step,
input=[target_embeddings])
cost = paddle.layer.classification_cost(
input=decoder_result, label=target)
return cost
else:
target_embeddings = paddle.layer.GeneratedInput(
size=dict_size,
embedding_name="_decoder_word_embedding",
embedding_size=word_vec_dim)
beam_gen = paddle.layer.beam_search(
name="decoder_group",
step=recurrent_decoder_step,
input=[target_embeddings],
bos_id=0,
eos_id=1,
beam_size=beam_size,
max_length=100)
return beam_gen
def memory_enhanced_seq2seq(encoder_input, decoder_input, decoder_target,
hidden_size, word_vec_dim, dict_size, is_generating,
beam_size):
"""Seq2Seq Model enhanced with external memory.
The "external memory" refers to two types of memories.
- Unbounded memory: i.e. attention mechanism in Seq2Seq.
- Bounded memory: i.e. external memory in NTM.
Both types of external memories can be implemented with
ExternalMemory class, and are both exploited in this Seq2Seq model.
Please refer to the function comments of memory_enhanced_decoder(...).
For more details about external memory, please refer to
`Neural Turing Machines <https://arxiv.org/abs/1410.5401>`_.
For more details about this memory-enhanced Seq2Seq, please
refer to `Memory-enhanced Decoder for Neural Machine Translation
<https://arxiv.org/abs/1606.02003>`_.
:params encoder_input: Encoder input.
:type encoder_input: LayerOutput
:params decoder_input: Decoder input.
:type decoder_input: LayerOutput
:params decoder_target: Decoder target.
:type decoder_target: LayerOutput
:params hidden_size: Hidden cell number, both in encoder and decoder rnn.
:type hidden_size: int
:params word_vec_dim: Word embedding size.
:type word_vec_dim: int
:param dict_size: Vocabulary size.
:type dict_size: int
:params is_generating: Whether for beam search inferencing (True) or
for training (False).
:type is_generating: bool
:params beam_size: Beam search width.
:type beam_size: int
:return: Cost layer if is_generating=False; Beam search layer if
is_generating = True.
:rtype: LayerOutput
"""
# encoder
context_encodings, sequence_encoding = bidirectional_gru_encoder(
input=encoder_input, size=hidden_size, word_vec_dim=word_vec_dim)
# decoder
return memory_enhanced_decoder(
input=decoder_input,
target=decoder_target,
initial_state=sequence_encoding,
source_context=context_encodings,
size=hidden_size,
word_vec_dim=word_vec_dim,
dict_size=dict_size,
is_generating=is_generating,
beam_size=beam_size)
"""
Contains training script for machine translation with external memory.
"""
import argparse
import sys
import gzip
import distutils.util
import random
import paddle.v2 as paddle
from external_memory import ExternalMemory
from model import memory_enhanced_seq2seq
from data_utils import reader_append_wrapper
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--dict_size",
default=30000,
type=int,
help="Vocabulary size. (default: %(default)s)")
parser.add_argument(
"--word_vec_dim",
default=512,
type=int,
help="Word embedding size. (default: %(default)s)")
parser.add_argument(
"--hidden_size",
default=1024,
type=int,
help="Hidden cell number in RNN. (default: %(default)s)")
parser.add_argument(
"--memory_slot_num",
default=8,
type=int,
help="External memory slot number. (default: %(default)s)")
parser.add_argument(
"--use_gpu",
default=False,
type=distutils.util.strtobool,
help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
"--trainer_count",
default=1,
type=int,
help="Trainer number. (default: %(default)s)")
parser.add_argument(
"--num_passes",
default=100,
type=int,
help="Training epochs. (default: %(default)s)")
parser.add_argument(
"--batch_size",
default=5,
type=int,
help="Batch size. (default: %(default)s)")
parser.add_argument(
"--memory_perturb_stddev",
default=0.1,
type=float,
help="Memory perturb stddev for memory initialization."
"(default: %(default)s)")
args = parser.parse_args()
def train():
"""
For training.
"""
# create optimizer
optimizer = paddle.optimizer.Adam(
learning_rate=5e-5,
gradient_clipping_threshold=5,
regularization=paddle.optimizer.L2Regularization(rate=8e-4))
# create network config
source_words = paddle.layer.data(
name="source_words",
type=paddle.data_type.integer_value_sequence(args.dict_size))
target_words = paddle.layer.data(
name="target_words",
type=paddle.data_type.integer_value_sequence(args.dict_size))
target_next_words = paddle.layer.data(
name='target_next_words',
type=paddle.data_type.integer_value_sequence(args.dict_size))
cost = memory_enhanced_seq2seq(
encoder_input=source_words,
decoder_input=target_words,
decoder_target=target_next_words,
hidden_size=args.hidden_size,
word_vec_dim=args.word_vec_dim,
dict_size=args.dict_size,
is_generating=False,
beam_size=None)
# create parameters and trainer
parameters = paddle.parameters.create(cost)
trainer = paddle.trainer.SGD(
cost=cost, parameters=parameters, update_equation=optimizer)
# create data readers
feeding = {
"source_words": 0,
"target_words": 1,
"target_next_words": 2,
"bounded_memory_perturbation": 3
}
random.seed(0) # for keeping consitancy for multiple runs
bounded_memory_perturbation = [[
random.gauss(0, args.memory_perturb_stddev)
for i in xrange(args.hidden_size)
] for j in xrange(args.memory_slot_num)]
train_append_reader = reader_append_wrapper(
reader=paddle.dataset.wmt14.train(args.dict_size),
append_tuple=(bounded_memory_perturbation, ))
train_batch_reader = paddle.batch(
reader=paddle.reader.shuffle(reader=train_append_reader, buf_size=8192),
batch_size=args.batch_size)
test_append_reader = reader_append_wrapper(
reader=paddle.dataset.wmt14.test(args.dict_size),
append_tuple=(bounded_memory_perturbation, ))
test_batch_reader = paddle.batch(
reader=paddle.reader.shuffle(reader=test_append_reader, buf_size=8192),
batch_size=args.batch_size)
# create event handler
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 10 == 0:
print "Pass: %d, Batch: %d, TrainCost: %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics)
with gzip.open("checkpoints/params.latest.tar.gz", 'w') as f:
parameters.to_tar(f)
else:
sys.stdout.write('.')
sys.stdout.flush()
if isinstance(event, paddle.event.EndPass):
result = trainer.test(reader=test_batch_reader, feeding=feeding)
print "Pass: %d, TestCost: %f, %s" % (event.pass_id, event.cost,
result.metrics)
with gzip.open("checkpoints/params.pass-%d.tar.gz" % event.pass_id,
'w') as f:
parameters.to_tar(f)
# run train
if not os.path.exists('checkpoints'):
os.mkdir('checkpoints')
trainer.train(
reader=train_batch_reader,
event_handler=event_handler,
num_passes=args.num_passes,
feeding=feeding)
def main():
paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
train()
if __name__ == '__main__':
main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册