diff --git a/demo/seqToseq/api_train_v2.py b/demo/seqToseq/api_train_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..efbab8d7b953416b3b93b6642fb23089aca2a3e8
--- /dev/null
+++ b/demo/seqToseq/api_train_v2.py
@@ -0,0 +1,106 @@
+import os
+
+import paddle.v2 as paddle
+
+from seqToseq_net_v2 import seqToseq_net_v2
+
+### Data Definition
+data_dir = "./data/pre-wmt14"
+src_lang_dict = os.path.join(data_dir, 'src.dict')
+trg_lang_dict = os.path.join(data_dir, 'trg.dict')
+
+source_dict_dim = len(open(src_lang_dict, "r").readlines())
+target_dict_dim = len(open(trg_lang_dict, "r").readlines())
+
+
+def read_to_dict(dict_path):
+    with open(dict_path, "r") as fin:
+        out_dict = {
+            line.strip(): line_count
+            for line_count, line in enumerate(fin)
+        }
+    return out_dict
+
+
+src_dict = read_to_dict(src_lang_dict)
+trg_dict = read_to_dict(trg_lang_dict)
+
+train_list = os.path.join(data_dir, 'train.list')
+test_list = os.path.join(data_dir, 'test.list')
+
+UNK_IDX = 2
+START = "<s>"
+END = "<e>"
+
+
+def _get_ids(s, dictionary):
+    words = s.strip().split()
+    return [dictionary[START]] + \
+           [dictionary.get(w, UNK_IDX) for w in words] + \
+           [dictionary[END]]
+
+
+def train_reader(file_name):
+    def reader():
+        with open(file_name, 'r') as f:
+            for line_count, line in enumerate(f):
+                line_split = line.strip().split('\t')
+                if len(line_split) != 2:
+                    continue
+                src_seq = line_split[0]  # one source sequence
+                src_ids = _get_ids(src_seq, src_dict)
+
+                trg_seq = line_split[1]  # one target sequence
+                trg_words = trg_seq.split()
+                trg_ids = [trg_dict.get(w, UNK_IDX) for w in trg_words]
+
+                # remove sequences whose length > 80 in training mode
+                if len(src_ids) > 80 or len(trg_ids) > 80:
+                    continue
+                trg_ids_next = trg_ids + [trg_dict[END]]
+                trg_ids = [trg_dict[START]] + trg_ids
+
+                yield src_ids, trg_ids, trg_ids_next
+
+    return reader
+
+
+def main():
+    paddle.init(use_gpu=False, trainer_count=1)
+
+    # reader = train_reader("data/pre-wmt14/train/train")
+    # define network topology
+    cost = seqToseq_net_v2(source_dict_dim, target_dict_dim)
+    parameters = paddle.parameters.create(cost)
+    optimizer = paddle.optimizer.Adam(batch_size=50, learning_rate=5e-4)
+
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 100 == 0:
+                print "Pass %d, Batch %d, Cost %f, %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics)
+
+    trainer = paddle.trainer.SGD(cost=cost,
+                                 parameters=parameters,
+                                 update_equation=optimizer)
+
+    reader_dict = {
+        'source_language_word': 0,
+        'target_language_word': 1,
+        'target_language_next_word': 2
+    }
+
+    trn_reader = paddle.reader.batched(
+        paddle.reader.shuffle(
+            train_reader("data/pre-wmt14/train/train"), buf_size=8192),
+        batch_size=10)
+
+    trainer.train(
+        reader=trn_reader,
+        event_handler=event_handler,
+        num_passes=10000,
+        reader_dict=reader_dict)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/demo/seqToseq/seqToseq_net_v2.py b/demo/seqToseq/seqToseq_net_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e057e24405f8ffc8006c2d24fb4183449641526
--- /dev/null
+++ b/demo/seqToseq/seqToseq_net_v2.py
@@ -0,0 +1,90 @@
+import paddle.v2.activation as activation
+import paddle.v2.attr as attr
+import paddle.v2.data_type as data_type
+import paddle.v2.layer as layer
+import paddle.v2.networks as networks
+
+
+def seqToseq_net_v2(source_dict_dim, target_dict_dim):
+    ### Network Architecture
+    word_vector_dim = 512  # dimension of word vector
+    decoder_size = 512  # dimension of hidden unit in GRU Decoder network
+    encoder_size = 512  # dimension of hidden unit in GRU Encoder network
+
+    #### Encoder
+    src_word_id = layer.data(
+        name='source_language_word',
+        type=data_type.dense_vector(source_dict_dim))
+    src_embedding = layer.embedding(
+        input=src_word_id,
+        size=word_vector_dim,
+        param_attr=attr.ParamAttr(name='_source_language_embedding'))
+    src_forward = networks.simple_gru(input=src_embedding, size=encoder_size)
+    src_backward = networks.simple_gru(
+        input=src_embedding, size=encoder_size, reverse=True)
+    encoded_vector = layer.concat(input=[src_forward, src_backward])
+
+    #### Decoder
+    with layer.mixed(size=decoder_size) as encoded_proj:
+        encoded_proj += layer.full_matrix_projection(input=encoded_vector)
+
+    backward_first = layer.first_seq(input=src_backward)
+
+    with layer.mixed(size=decoder_size, act=activation.Tanh()) as decoder_boot:
+        decoder_boot += layer.full_matrix_projection(input=backward_first)
+
+    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
+
+        decoder_mem = layer.memory(
+            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
+
+        context = networks.simple_attention(
+            encoded_sequence=enc_vec,
+            encoded_proj=enc_proj,
+            decoder_state=decoder_mem)
+
+        with layer.mixed(size=decoder_size * 3) as decoder_inputs:
+            decoder_inputs += layer.full_matrix_projection(input=context)
+            decoder_inputs += layer.full_matrix_projection(input=current_word)
+
+        gru_step = layer.gru_step(
+            name='gru_decoder',
+            input=decoder_inputs,
+            output_mem=decoder_mem,
+            size=decoder_size)
+
+        with layer.mixed(
+                size=target_dict_dim, bias_attr=True,
+                act=activation.Softmax()) as out:
+            out += layer.full_matrix_projection(input=gru_step)
+        return out
+
+    decoder_group_name = "decoder_group"
+    group_input1 = layer.StaticInputV2(input=encoded_vector, is_seq=True)
+    group_input2 = layer.StaticInputV2(input=encoded_proj, is_seq=True)
+    group_inputs = [group_input1, group_input2]
+
+    trg_embedding = layer.embedding(
+        input=layer.data(
+            name='target_language_word',
+            type=data_type.dense_vector(target_dict_dim)),
+        size=word_vector_dim,
+        param_attr=attr.ParamAttr(name='_target_language_embedding'))
+    group_inputs.append(trg_embedding)
+
+    # For a decoder equipped with an attention mechanism, in training,
+    # the target embedding (the ground truth) is the data input,
+    # while the encoded source sequence is accessed as an unbounded memory.
+    # Here, the StaticInput defines a read-only memory
+    # for the recurrent_group.
+    decoder = layer.recurrent_group(
+        name=decoder_group_name,
+        step=gru_decoder_with_attention,
+        input=group_inputs)
+
+    lbl = layer.data(
+        name='target_language_next_word',
+        type=data_type.dense_vector(target_dict_dim))
+    cost = layer.classification_cost(input=decoder, label=lbl)
+
+    return cost
diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py
index 2f55611aaa1d3ae22f5d7f184b38e622271881ea..010773ddbd96d4226cccc1a63cfc133b78bdcffe 100644
--- a/python/paddle/v2/layer.py
+++ b/python/paddle/v2/layer.py
@@ -262,7 +262,7 @@ class StaticInputV2(object):
         self.input = input
         self.is_seq = is_seq
         self.size = size
-        # TODO(qiaolongfei): add size
+        # TODO(add size check)
         # assert input.size is not None or size is not None


diff --git a/python/paddle/v2/topology.py b/python/paddle/v2/topology.py
index 4c211254319bbdf46b02a2cee56b6a98b01819a2..f0679c5675b0c0f24f28f3df22efd4eb51ccbb3a 100644
--- a/python/paddle/v2/topology.py
+++ b/python/paddle/v2/topology.py
@@ -17,6 +17,7 @@ import collections
 from paddle.proto.ModelConfig_pb2 import ModelConfig

 import layer as v2_layer
+from layer import WithExtraParent

 __all__ = ['Topology']

@@ -40,7 +41,10 @@ def __bfs_travel__(callback, *layers):
         __break__ = callback(each_layer)
         if __break__:
             return
-        __bfs_travel__(callback, *each_layer.__parent_layers__.values())
+        __layers__ = each_layer.__parent_layers__.values()
+        if isinstance(each_layer, WithExtraParent):
+            __layers__ = __layers__ + each_layer.extra_parent()
+        __bfs_travel__(callback, *__layers__)


 class Topology(object):