提交 b22cd96a 编写于 作者: J jacquesqiao 提交者: GitHub

Merge pull request #1761 from jacquesqiao/beam_search

support Beam search in v2 api
import sys import sys
import paddle.v2 as paddle import paddle.v2 as paddle
def seqToseq_net(source_dict_dim, target_dict_dim): def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
### Network Architecture ### Network Architecture
word_vector_dim = 512 # dimension of word vector word_vector_dim = 512 # dimension of word vector
decoder_size = 512 # dimension of hidden unit in GRU Decoder network decoder_size = 512 # dimension of hidden unit in GRU Decoder network
encoder_size = 512 # dimension of hidden unit in GRU Encoder network encoder_size = 512 # dimension of hidden unit in GRU Encoder network
beam_size = 3
max_length = 250
#### Encoder #### Encoder
src_word_id = paddle.layer.data( src_word_id = paddle.layer.data(
name='source_language_word', name='source_language_word',
...@@ -67,30 +71,57 @@ def seqToseq_net(source_dict_dim, target_dict_dim): ...@@ -67,30 +71,57 @@ def seqToseq_net(source_dict_dim, target_dict_dim):
group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True) group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True)
group_inputs = [group_input1, group_input2] group_inputs = [group_input1, group_input2]
trg_embedding = paddle.layer.embedding( if not is_generating:
input=paddle.layer.data( trg_embedding = paddle.layer.embedding(
name='target_language_word', input=paddle.layer.data(
type=paddle.data_type.integer_value_sequence(target_dict_dim)), name='target_language_word',
size=word_vector_dim, type=paddle.data_type.integer_value_sequence(target_dict_dim)),
param_attr=paddle.attr.ParamAttr(name='_target_language_embedding')) size=word_vector_dim,
group_inputs.append(trg_embedding) param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
group_inputs.append(trg_embedding)
# For decoder equipped with attention mechanism, in training,
# target embedding (the ground truth) is the data input, # For decoder equipped with attention mechanism, in training,
# while encoded source sequence is accessed as an unbounded memory. # target embedding (the ground truth) is the data input,
# Here, the StaticInput defines a read-only memory # while encoded source sequence is accessed as an unbounded memory.
# for the recurrent_group. # Here, the StaticInput defines a read-only memory
decoder = paddle.layer.recurrent_group( # for the recurrent_group.
name=decoder_group_name, decoder = paddle.layer.recurrent_group(
step=gru_decoder_with_attention, name=decoder_group_name,
input=group_inputs) step=gru_decoder_with_attention,
input=group_inputs)
lbl = paddle.layer.data(
name='target_language_next_word', lbl = paddle.layer.data(
type=paddle.data_type.integer_value_sequence(target_dict_dim)) name='target_language_next_word',
cost = paddle.layer.classification_cost(input=decoder, label=lbl) type=paddle.data_type.integer_value_sequence(target_dict_dim))
cost = paddle.layer.classification_cost(input=decoder, label=lbl)
return cost
return cost
else:
# In generation, the decoder predicts a next target word based on
# the encoded source sequence and the last generated target word.
# The encoded source sequence (encoder's output) must be specified by
# StaticInput, which is a read-only memory.
# Embedding of the last generated word is automatically gotten by
# GeneratedInputs, which is initialized by a start mark, such as <s>,
# and must be included in generation.
trg_embedding = paddle.layer.GeneratedInputV2(
size=target_dict_dim,
embedding_name='_target_language_embedding',
embedding_size=word_vector_dim)
group_inputs.append(trg_embedding)
beam_gen = paddle.layer.beam_search(
name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs,
bos_id=0,
eos_id=1,
beam_size=beam_size,
max_length=max_length)
return beam_gen
def main(): def main():
......
...@@ -67,7 +67,16 @@ class Layer(object): ...@@ -67,7 +67,16 @@ class Layer(object):
self.name = name self.name = name
self.__context__ = {} self.__context__ = {}
self.__parent_layers__ = parent_layers self.__parent_layers__ = parent_layers
self.__children_layers__ = [] # used for evaluator. # some layer may have some extra parent layer
self.__extra_parent__ = []
# used for evaluator.
self.__children_layers__ = []
def extra_parent(self):
    """
    Return the extra parent layers registered on this layer.

    The internal list itself is handed back, not a copy.

    :return: the list stored in ``self.__extra_parent__``
    """
    registered = self.__extra_parent__
    return registered
def append_extra_parent(self, parent):
    """
    Register ``parent`` as an extra parent of this layer.

    The object is appended to the end of ``self.__extra_parent__``.

    :param parent: the object to add to the extra-parent list
    """
    registered = self.__extra_parent__
    registered.append(parent)
def append_child(self, layer, parent_names): def append_child(self, layer, parent_names):
self.__children_layers__.append((layer, parent_names)) self.__children_layers__.append((layer, parent_names))
...@@ -78,14 +87,20 @@ class Layer(object): ...@@ -78,14 +87,20 @@ class Layer(object):
""" """
self.__context__ = context self.__context__ = context
# short cut if myself is parsed before. # STEP: short cut if this layer is parsed before.
if self.context_name() in context: if self.context_name() in context:
if self.use_context_name(): if self.use_context_name():
return context[self.context_name()] return context[self.context_name()]
else: else:
return context[self.name] return context[self.name]
# parse parent before myself # STEP: parse extra_parent that is not used by this layer but must
# be parsed before this layer.
for p in self.__extra_parent__:
p.to_proto(context=context)
# STEP: parse parent that is used by this layer, get the result and
# insert into kwargs of the next layer's to_proto_impl method.
kwargs = dict() kwargs = dict()
for layer_name in self.__parent_layers__: for layer_name in self.__parent_layers__:
if not isinstance(self.__parent_layers__[layer_name], if not isinstance(self.__parent_layers__[layer_name],
...@@ -97,14 +112,13 @@ class Layer(object): ...@@ -97,14 +112,13 @@ class Layer(object):
self.__parent_layers__[layer_name]) self.__parent_layers__[layer_name])
kwargs[layer_name] = v1_layer kwargs[layer_name] = v1_layer
# parse myself. # STEP: parse myself and add myself into context.
ret_val = self.to_proto_impl(**kwargs) ret_val = self.to_proto_impl(**kwargs)
if self.context_name() is not None \
if self.context_name() is not None and \ and self.context_name() not in context:
self.context_name() not in context:
context[self.context_name()] = ret_val context[self.context_name()] = ret_val
# parse children. # STEP: parse children that should be parsed after this layer.
for layer, pnames in self.__children_layers__: for layer, pnames in self.__children_layers__:
drop = False drop = False
...@@ -117,6 +131,7 @@ class Layer(object): ...@@ -117,6 +131,7 @@ class Layer(object):
continue continue
layer.to_proto(context=context) layer.to_proto(context=context)
# STEP: return v1 layer result
if self.context_name() is None: if self.context_name() is None:
return ret_val return ret_val
elif self.use_context_name(): elif self.use_context_name():
......
...@@ -33,22 +33,25 @@ The primary usage shows below. ...@@ -33,22 +33,25 @@ The primary usage shows below.
import collections import collections
import inspect import inspect
from config_base import Layer, __convert_to_v2__ import re
import paddle.trainer_config_helpers as conf_helps import paddle.trainer_config_helpers as conf_helps
from paddle.trainer.config_parser import \
RecurrentLayerGroupWithoutOutLinksBegin, RecurrentLayerGroupSetOutLink, \
RecurrentLayerGroupEnd, model_type
from paddle.trainer_config_helpers.config_parser_utils import \ from paddle.trainer_config_helpers.config_parser_utils import \
parse_network_config as __parse__ parse_network_config as __parse__
from paddle.trainer_config_helpers.default_decorators import wrap_act_default from paddle.trainer_config_helpers.default_decorators import wrap_act_default
from paddle.trainer_config_helpers.default_decorators import \ from paddle.trainer_config_helpers.default_decorators import \
wrap_bias_attr_default wrap_bias_attr_default
from paddle.trainer_config_helpers.default_decorators import wrap_name_default from paddle.trainer_config_helpers.default_decorators import wrap_name_default
from paddle.trainer_config_helpers.layers import RecurrentLayerGroupSetGenerator, Generator
from paddle.trainer_config_helpers.layers import layer_support from paddle.trainer_config_helpers.layers import layer_support
from paddle.trainer.config_parser import \
RecurrentLayerGroupWithoutOutLinksBegin, RecurrentLayerGroupSetOutLink, \
RecurrentLayerGroupEnd, model_type
import activation import activation
import re import attr
import data_type import data_type
from config_base import Layer, __convert_to_v2__
__all__ = ['parse_network', 'data'] __all__ = ['parse_network', 'data']
...@@ -132,54 +135,23 @@ class DataLayerV2(Layer): ...@@ -132,54 +135,23 @@ class DataLayerV2(Layer):
return doc return doc
class WithExtraParent(Layer): class MemoryV2(Layer):
def extra_parent(self): def __init__(self, name, extra_input=None, **kwargs):
return self.__extra_parent__
def __init__(self, name=None, parent_layers=None):
self.__extra_parent__ = []
super(WithExtraParent, self).__init__(
name=name, parent_layers=parent_layers)
def append_extra_parent(self, parent):
self.__extra_parent__.append(parent)
def to_proto(self, context):
""" """
function to set proto attribute Init memory object, if memory is inited inside recurrent_group step
function, it may depend on a boot_layer that should be initialized
outside recurrent_group, so we:
1. add RecurrentLayerInput to extra_parent of self.
2. add boot_layer to the extra_parent of RecurrentLayerInput.
:param extra_input: list of RecurrentLayerInput
:type extra_input: [RecurrentLayerInput]
""" """
kwargs = dict()
for p in self.__extra_parent__:
p.to_proto(context=context)
for layer_name in self.__parent_layers__:
if not isinstance(self.__parent_layers__[layer_name],
collections.Sequence):
v1_layer = self.__parent_layers__[layer_name].to_proto(
context=context)
else:
v1_layer = map(lambda x: x.to_proto(context=context),
self.__parent_layers__[layer_name])
kwargs[layer_name] = v1_layer
if self.context_name() is None:
return self.to_proto_impl(context=context, **kwargs)
elif self.context_name() not in context:
context[self.context_name()] = self.to_proto_impl(
context=context, **kwargs)
if self.use_context_name():
return context[self.context_name()]
else:
return context[self.name]
class MemoryV2(WithExtraParent):
def __init__(self, name, **kwargs):
self.name = name self.name = name
super(MemoryV2, self).__init__(name=name, parent_layers=dict()) super(MemoryV2, self).__init__(name=name, parent_layers=dict())
self.__kwargs__ = kwargs self.__kwargs__ = kwargs
self.__boot_layer_name__ = None self.__boot_layer_name__ = None
if 'boot_layer' in kwargs: if 'boot_layer' in kwargs:
begin_of_current_rnn = [] begin_of_current_rnn = []
# TODO(yuyang18): Fix inspect, it could be wrong when user invoke a # TODO(yuyang18): Fix inspect, it could be wrong when user invoke a
...@@ -202,11 +174,10 @@ class MemoryV2(WithExtraParent): ...@@ -202,11 +174,10 @@ class MemoryV2(WithExtraParent):
assert begin_of_current_rnn is not None assert begin_of_current_rnn is not None
for extra in begin_of_current_rnn: for extra in begin_of_current_rnn:
self.append_extra_parent(extra) self.append_extra_parent(extra)
assert isinstance(extra, WithExtraParent)
extra.append_extra_parent(kwargs['boot_layer']) extra.append_extra_parent(kwargs['boot_layer'])
self.__boot_layer_name__ = kwargs['boot_layer'].name self.__boot_layer_name__ = kwargs['boot_layer'].name
def to_proto_impl(self, context, **kwargs): def to_proto_impl(self, **kwargs):
args = dict() args = dict()
for each in kwargs: for each in kwargs:
args[each] = kwargs[each] args[each] = kwargs[each]
...@@ -214,7 +185,7 @@ class MemoryV2(WithExtraParent): ...@@ -214,7 +185,7 @@ class MemoryV2(WithExtraParent):
args[each] = self.__kwargs__[each] args[each] = self.__kwargs__[each]
if self.__boot_layer_name__ is not None: if self.__boot_layer_name__ is not None:
args['boot_layer'] = context[self.__boot_layer_name__] args['boot_layer'] = self.__context__[self.__boot_layer_name__]
size = args.get('size', None) size = args.get('size', None)
if size is not None: if size is not None:
...@@ -236,22 +207,6 @@ class MemoryV2(WithExtraParent): ...@@ -236,22 +207,6 @@ class MemoryV2(WithExtraParent):
return True return True
class LayerOutputV2(Layer):
    """
    Wrap a v1 ``LayerOutput`` so it can be used as a v2 layer.

    No parents are recorded for the wrapper (an empty dict is passed to the
    base class); ``to_proto_impl`` simply hands back the wrapped object.
    """

    def __init__(self, layer_output):
        """
        :param layer_output: the v1 result to wrap
        :type layer_output: conf_helps.LayerOutput
        """
        assert isinstance(layer_output, conf_helps.LayerOutput)
        self.layer_output = layer_output
        wrapped_name = layer_output.name
        super(LayerOutputV2, self).__init__(
            name=wrapped_name, parent_layers={})

    def to_proto_impl(self):
        """Return the wrapped v1 ``LayerOutput`` unchanged."""
        wrapped = self.layer_output
        return wrapped
class StaticInputV2(object): class StaticInputV2(object):
def __init__(self, input, is_seq=False, size=None): def __init__(self, input, is_seq=False, size=None):
assert isinstance(input, LayerV2) assert isinstance(input, LayerV2)
...@@ -263,6 +218,66 @@ class StaticInputV2(object): ...@@ -263,6 +218,66 @@ class StaticInputV2(object):
# assert input.size is not None or size is not None # assert input.size is not None or size is not None
class BaseGeneratedInputV2(object):
    """
    Base class for an input that is produced during generation.

    ``bos_id`` and ``eos_id`` are ``None`` after construction; ``beam_search``
    assigns them on the generated input it receives. The two step hooks have
    no default behaviour and must be supplied by subclasses.
    """

    def __init__(self):
        # Both markers stay unset until a caller fills them in.
        self.bos_id = self.eos_id = None

    def before_real_step(self):
        """Hook called before the user step; not implemented here."""
        raise NotImplementedError()

    def after_real_step(self, *args):
        """Hook called with the user step's result; not implemented here."""
        raise NotImplementedError()
class GeneratedInputV2(BaseGeneratedInputV2):
    """
    Generated input that feeds the embedding of the previously predicted id
    into the step function.

    :param size: size given to the ``__beam_search_predict__`` memory
    :param embedding_name: parameter name used for the embedding layer
    :param embedding_size: size given to the embedding layer
    """

    def __init__(self, size, embedding_name, embedding_size):
        super(GeneratedInputV2, self).__init__()
        self.embedding_size = embedding_size
        self.embedding_name = embedding_name
        self.size = size

    def before_real_step(self):
        """
        Build the layer handed to the step function.

        A memory named ``__beam_search_predict__`` is created, booted with
        the constant id ``self.bos_id``, and looked up in an embedding whose
        parameter is named ``self.embedding_name``.

        :return: the embedding layer
        """
        previous_id = memory(
            name='__beam_search_predict__',
            size=self.size,
            boot_with_const_id=self.bos_id)
        table_attr = attr.ParamAttr(name=self.embedding_name)
        return embedding(
            input=previous_id,
            size=self.embedding_size,
            param_attr=table_attr)

    def after_real_step(self, input):
        """
        Turn the step output into a prediction via ``max_id``.

        The resulting layer carries the same name as the memory created in
        ``before_real_step``.

        :param input: the value returned by the step function
        :return: the ``max_id`` layer named ``__beam_search_predict__``
        """
        return max_id(input=input, name='__beam_search_predict__')
class RecurrentLayerGroupSetGeneratorV2(Layer):
    """
    Layer-like object whose only job, when parsed, is to register a
    ``Generator`` through ``RecurrentLayerGroupSetGenerator``.

    It has no parent layers and is stored in the context under
    ``<eos_name>.fake``.

    :param eos_name: name of the end-of-sequence layer; also used as this
        object's own name
    :param max_length: forwarded as ``max_num_frames``
    :param beam_size: forwarded as ``beam_size``
    :param num_results_per_sample: forwarded as ``num_results_per_sample``
    """

    def __init__(self, eos_name, max_length, beam_size, num_results_per_sample):
        # Attributes are assigned before the base initializer runs, as in
        # the original ordering.
        self.eos_name = eos_name
        self.max_length = max_length
        self.beam_size = beam_size
        self.num_results_per_sample = num_results_per_sample
        super(RecurrentLayerGroupSetGeneratorV2, self).__init__(
            name=eos_name, parent_layers=dict())

    def context_name(self):
        """Return the context key: the eos layer name plus ``.fake``."""
        suffix = ".fake"
        return self.eos_name + suffix

    def use_context_name(self):
        """Always ``True``."""
        return True

    def to_proto_impl(self, **kwargs):
        """
        Register the generator settings and return ``self``.

        ``kwargs`` is accepted but not used.
        """
        generator_config = Generator(
            eos_layer_name=self.eos_name,
            max_num_frames=self.max_length,
            beam_size=self.beam_size,
            num_results_per_sample=self.num_results_per_sample)
        RecurrentLayerGroupSetGenerator(generator_config)
        return self
class MixedLayerV2(Layer): class MixedLayerV2(Layer):
""" """
This class is used to support `with` grammar. If not, the following code This class is used to support `with` grammar. If not, the following code
...@@ -341,18 +356,24 @@ def mixed(size=0, ...@@ -341,18 +356,24 @@ def mixed(size=0,
return MixedLayerV2(size, input, name, act, bias_attr, layer_attr) return MixedLayerV2(size, input, name, act, bias_attr, layer_attr)
class RecurrentLayerInput(WithExtraParent): class RecurrentLayerInput(Layer):
def __init__(self, recurrent_name, index, parent_layers): def __init__(self, recurrent_name, index, parent_layers):
assert len(parent_layers) == 1 parents_len = len(parent_layers)
self.__parents__ = parent_layers.values()[0] assert parents_len <= 1
super(RecurrentLayerInput, self).__init__( if parents_len == 0:
name=self.__parents__[index].name, parent_layers=parent_layers) self.__parents__ = []
else:
self.__parents__ = parent_layers.values()[0]
self.__recurrent_name__ = recurrent_name self.__recurrent_name__ = recurrent_name
name = self.__parents__[
index].name if index >= 0 else self.context_name()
super(RecurrentLayerInput, self).__init__(
name=name, parent_layers=parent_layers)
def context_name(self): def context_name(self):
return self.__recurrent_name__ + ".begin" return self.__recurrent_name__ + ".begin"
def to_proto_impl(self, context, **kwargs): def to_proto_impl(self, **kwargs):
model_type('recurrent_nn') model_type('recurrent_nn')
RecurrentLayerGroupWithoutOutLinksBegin( RecurrentLayerGroupWithoutOutLinksBegin(
name=self.__recurrent_name__, name=self.__recurrent_name__,
...@@ -449,6 +470,11 @@ def recurrent_group(step, input, name=None): ...@@ -449,6 +470,11 @@ def recurrent_group(step, input, name=None):
for i in xrange(len(non_static_inputs)) for i in xrange(len(non_static_inputs))
] ]
extra_input = None
if len(non_static_inputs) == 0:
extra_input = RecurrentLayerInput(
recurrent_name=name, index=-1, parent_layers={})
def __real_step__(*args): def __real_step__(*args):
rnn_input = list(args) rnn_input = list(args)
static_inputs = filter(lambda x: isinstance(x, StaticInputV2), input) static_inputs = filter(lambda x: isinstance(x, StaticInputV2), input)
...@@ -456,6 +482,7 @@ def recurrent_group(step, input, name=None): ...@@ -456,6 +482,7 @@ def recurrent_group(step, input, name=None):
mem_name = "__%s_memory__" % static_input.input.name mem_name = "__%s_memory__" % static_input.input.name
mem = memory( mem = memory(
name=mem_name, name=mem_name,
extra_input=extra_input,
is_seq=static_input.is_seq, is_seq=static_input.is_seq,
size=static_input.input.calculate_size, size=static_input.input.calculate_size,
boot_layer=static_input.input) boot_layer=static_input.input)
...@@ -485,6 +512,73 @@ def recurrent_group(step, input, name=None): ...@@ -485,6 +512,73 @@ def recurrent_group(step, input, name=None):
return retv return retv
@wrap_name_default()
def beam_search(step,
                input,
                bos_id,
                eos_id,
                beam_size,
                max_length=500,
                name=None,
                num_results_per_sample=None):
    """
    Build a generating recurrent group around ``step``.

    Exactly one element of ``input`` must be a ``BaseGeneratedInputV2``; all
    others must be ``StaticInputV2``. The generated input receives ``bos_id``
    and ``eos_id``, and its ``before_real_step`` / ``after_real_step`` hooks
    wrap the call to ``step``. Only the static inputs are passed on to
    ``recurrent_group``.

    :param step: step function; called with the arguments from the recurrent
        group plus the layer from ``before_real_step`` inserted at the
        position the generated input had in ``input``
    :param input: a single input or a list of inputs
    :type input: StaticInputV2 | BaseGeneratedInputV2 | list
    :param bos_id: stored on the generated input as ``bos_id``
    :param eos_id: stored on the generated input and passed to the ``eos``
        layer
    :param beam_size: passed to ``RecurrentLayerGroupSetGeneratorV2``
    :param max_length: passed to ``RecurrentLayerGroupSetGeneratorV2``
    :param name: name of the recurrent group; also used to derive the eos
        layer name ``__<name>_eos_layer__``
    :param num_results_per_sample: defaults to ``beam_size`` when ``None``;
        must not exceed ``beam_size``
    :return: the value returned by ``recurrent_group``
    """
    if num_results_per_sample is None:
        num_results_per_sample = beam_size
    assert num_results_per_sample <= beam_size

    accepted_types = (StaticInputV2, BaseGeneratedInputV2)
    if isinstance(input, accepted_types):
        input = [input]

    # Split the inputs: remember where the single generated input sits and
    # collect everything else for the recurrent group.
    generated_input_index = -1
    real_input = []
    for position, candidate in enumerate(input):
        assert isinstance(candidate, accepted_types)
        if not isinstance(candidate, BaseGeneratedInputV2):
            real_input.append(candidate)
            continue
        assert generated_input_index == -1
        generated_input_index = position

    assert generated_input_index != -1

    gipt = input[generated_input_index]
    assert isinstance(gipt, BaseGeneratedInputV2)
    gipt.bos_id, gipt.eos_id = bos_id, eos_id

    def __real_step__(*args):
        eos_name = "__%s_eos_layer__" % name
        generator = RecurrentLayerGroupSetGeneratorV2(
            eos_name, max_length, beam_size, num_results_per_sample)

        args = list(args)
        before_step_layer = gipt.before_real_step()
        # The generator is attached as a child of the pre-step layer.
        before_step_layer.append_child(
            layer=generator, parent_names=[before_step_layer.name])
        args.insert(generated_input_index, before_step_layer)

        predict = gipt.after_real_step(step(*args))

        # The eos layer is attached as a child of the prediction.
        eos_layer = eos(input=predict, eos_id=eos_id, name=eos_name)
        predict.append_child(layer=eos_layer, parent_names=[predict.name])
        return predict

    return recurrent_group(step=__real_step__, input=real_input, name=name)
__projection_names__ = filter(lambda x: x.endswith('_projection'), __projection_names__ = filter(lambda x: x.endswith('_projection'),
dir(conf_helps)) dir(conf_helps))
......
...@@ -17,7 +17,6 @@ import collections ...@@ -17,7 +17,6 @@ import collections
from paddle.proto.ModelConfig_pb2 import ModelConfig from paddle.proto.ModelConfig_pb2 import ModelConfig
import layer as v2_layer import layer as v2_layer
from layer import WithExtraParent
__all__ = ['Topology'] __all__ = ['Topology']
...@@ -41,9 +40,8 @@ def __bfs_travel__(callback, *layers): ...@@ -41,9 +40,8 @@ def __bfs_travel__(callback, *layers):
__break__ = callback(each_layer) __break__ = callback(each_layer)
if __break__: if __break__:
return return
__layers__ = each_layer.__parent_layers__.values() __layers__ = each_layer.__parent_layers__.values() + \
if isinstance(each_layer, WithExtraParent): each_layer.extra_parent()
__layers__ = __layers__ + each_layer.extra_parent()
__bfs_travel__(callback, *__layers__) __bfs_travel__(callback, *__layers__)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册