diff --git a/demo/seqToseq/api_train_v2.py b/demo/seqToseq/api_train_v2.py index 5b7506b152ccd6ecf776a047548956984910ce5d..6efd254e7a48703a69c9f09dd35d41ba7ac5689a 100644 --- a/demo/seqToseq/api_train_v2.py +++ b/demo/seqToseq/api_train_v2.py @@ -1,76 +1,106 @@ -import os - import paddle.v2 as paddle -from seqToseq_net_v2 import seqToseq_net_v2 - -# Data Definiation. -# TODO:This code should be merged to dataset package. -data_dir = "./data/pre-wmt14" -src_lang_dict = os.path.join(data_dir, 'src.dict') -trg_lang_dict = os.path.join(data_dir, 'trg.dict') - -source_dict_dim = len(open(src_lang_dict, "r").readlines()) -target_dict_dim = len(open(trg_lang_dict, "r").readlines()) - - -def read_to_dict(dict_path): - with open(dict_path, "r") as fin: - out_dict = { - line.strip(): line_count - for line_count, line in enumerate(fin) - } - return out_dict - - -src_dict = read_to_dict(src_lang_dict) -trg_dict = read_to_dict(trg_lang_dict) - -train_list = os.path.join(data_dir, 'train.list') -test_list = os.path.join(data_dir, 'test.list') - -UNK_IDX = 2 -START = "" -END = "" - -def _get_ids(s, dictionary): - words = s.strip().split() - return [dictionary[START]] + \ - [dictionary.get(w, UNK_IDX) for w in words] + \ - [dictionary[END]] - - -def train_reader(file_name): - def reader(): - with open(file_name, 'r') as f: - for line_count, line in enumerate(f): - line_split = line.strip().split('\t') - if len(line_split) != 2: - continue - src_seq = line_split[0] # one source sequence - src_ids = _get_ids(src_seq, src_dict) - - trg_seq = line_split[1] # one target sequence - trg_words = trg_seq.split() - trg_ids = [trg_dict.get(w, UNK_IDX) for w in trg_words] - - # remove sequence whose length > 80 in training mode - if len(src_ids) > 80 or len(trg_ids) > 80: - continue - trg_ids_next = trg_ids + [trg_dict[END]] - trg_ids = [trg_dict[START]] + trg_ids - - yield src_ids, trg_ids, trg_ids_next - - return reader +def seqToseq_net(source_dict_dim, target_dict_dim): + ### Network Architecture + word_vector_dim = 512 # dimension of word vector + decoder_size = 512 # dimension of hidden unit in GRU Decoder network + encoder_size = 512 # dimension of hidden unit in GRU Encoder network + + #### Encoder + src_word_id = paddle.layer.data( + name='source_language_word', + type=paddle.data_type.integer_value_sequence(source_dict_dim)) + src_embedding = paddle.layer.embedding( + input=src_word_id, + size=word_vector_dim, + param_attr=paddle.attr.ParamAttr(name='_source_language_embedding')) + src_forward = paddle.networks.simple_gru( + input=src_embedding, size=encoder_size) + src_backward = paddle.networks.simple_gru( + input=src_embedding, size=encoder_size, reverse=True) + encoded_vector = paddle.layer.concat(input=[src_forward, src_backward]) + + #### Decoder + with paddle.layer.mixed(size=decoder_size) as encoded_proj: + encoded_proj += paddle.layer.full_matrix_projection( + input=encoded_vector) + + backward_first = paddle.layer.first_seq(input=src_backward) + + with paddle.layer.mixed( + size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot: + decoder_boot += paddle.layer.full_matrix_projection( + input=backward_first) + + def gru_decoder_with_attention(enc_vec, enc_proj, current_word): + + decoder_mem = paddle.layer.memory( + name='gru_decoder', size=decoder_size, boot_layer=decoder_boot) + + context = paddle.networks.simple_attention( + encoded_sequence=enc_vec, + encoded_proj=enc_proj, + decoder_state=decoder_mem) + + with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs: + decoder_inputs += paddle.layer.full_matrix_projection(input=context) + decoder_inputs += paddle.layer.full_matrix_projection( + input=current_word) + + gru_step = paddle.layer.gru_step( + name='gru_decoder', + input=decoder_inputs, + output_mem=decoder_mem, + size=decoder_size) + + with paddle.layer.mixed( + size=target_dict_dim, + bias_attr=True, + act=paddle.activation.Softmax()) as out: + out += paddle.layer.full_matrix_projection(input=gru_step) + return out + + decoder_group_name = "decoder_group" + group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True) + group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True) + group_inputs = [group_input1, group_input2] + + trg_embedding = paddle.layer.embedding( + input=paddle.layer.data( + name='target_language_word', + type=paddle.data_type.integer_value_sequence(target_dict_dim)), + size=word_vector_dim, + param_attr=paddle.attr.ParamAttr(name='_target_language_embedding')) + group_inputs.append(trg_embedding) + + # For decoder equipped with attention mechanism, in training, + # target embeding (the groudtruth) is the data input, + # while encoded source sequence is accessed to as an unbounded memory. + # Here, the StaticInput defines a read-only memory + # for the recurrent_group. + decoder = paddle.layer.recurrent_group( + name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs) + + lbl = paddle.layer.data( + name='target_language_next_word', + type=paddle.data_type.integer_value_sequence(target_dict_dim)) + cost = paddle.layer.classification_cost(input=decoder, label=lbl) + + return cost def main(): paddle.init(use_gpu=False, trainer_count=1) + # source and target dict dim. + dict_size = 30000 + source_dict_dim = target_dict_dim = dict_size + # define network topology - cost = seqToseq_net_v2(source_dict_dim, target_dict_dim) + cost = seqToseq_net(source_dict_dim, target_dict_dim) parameters = paddle.parameters.create(cost) # define optimize method and trainer @@ -88,7 +118,7 @@ def main(): wmt14_reader = paddle.batch( paddle.reader.shuffle( - train_reader("data/pre-wmt14/train/train"), buf_size=8192), + paddle.dataset.wmt14.train(dict_size=dict_size), buf_size=8192), batch_size=5) # define event_handler callback diff --git a/demo/seqToseq/seqToseq_net_v2.py b/demo/seqToseq/seqToseq_net_v2.py deleted file mode 100644 index 058a6789d7094c71492ed9772ed5594c4c0c8f84..0000000000000000000000000000000000000000 --- a/demo/seqToseq/seqToseq_net_v2.py +++ /dev/null @@ -1,92 +0,0 @@ -import paddle.v2 as paddle - - -def seqToseq_net_v2(source_dict_dim, target_dict_dim): - ### Network Architecture - word_vector_dim = 512 # dimension of word vector - decoder_size = 512 # dimension of hidden unit in GRU Decoder network - encoder_size = 512 # dimension of hidden unit in GRU Encoder network - - #### Encoder - src_word_id = paddle.layer.data( - name='source_language_word', - type=paddle.data_type.integer_value_sequence(source_dict_dim)) - src_embedding = paddle.layer.embedding( - input=src_word_id, - size=word_vector_dim, - param_attr=paddle.attr.ParamAttr(name='_source_language_embedding')) - src_forward = paddle.networks.simple_gru( - input=src_embedding, size=encoder_size) - src_backward = paddle.networks.simple_gru( - input=src_embedding, size=encoder_size, reverse=True) - encoded_vector = paddle.layer.concat(input=[src_forward, src_backward]) - - #### Decoder - with paddle.layer.mixed(size=decoder_size) as encoded_proj: - encoded_proj += paddle.layer.full_matrix_projection( - input=encoded_vector) - - backward_first = paddle.layer.first_seq(input=src_backward) - - with paddle.layer.mixed( - size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot: - decoder_boot += paddle.layer.full_matrix_projection( - input=backward_first) - - def gru_decoder_with_attention(enc_vec, enc_proj, current_word): - - decoder_mem = paddle.layer.memory( - name='gru_decoder', size=decoder_size, boot_layer=decoder_boot) - - context = paddle.networks.simple_attention( - encoded_sequence=enc_vec, - encoded_proj=enc_proj, - decoder_state=decoder_mem) - - with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs: - decoder_inputs += paddle.layer.full_matrix_projection(input=context) - decoder_inputs += paddle.layer.full_matrix_projection( - input=current_word) - - gru_step = paddle.layer.gru_step( - name='gru_decoder', - input=decoder_inputs, - output_mem=decoder_mem, - size=decoder_size) - - with paddle.layer.mixed( - size=target_dict_dim, - bias_attr=True, - act=paddle.activation.Softmax()) as out: - out += paddle.layer.full_matrix_projection(input=gru_step) - return out - - decoder_group_name = "decoder_group" - group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True) - group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True) - group_inputs = [group_input1, group_input2] - - trg_embedding = paddle.layer.embedding( - input=paddle.layer.data( - name='target_language_word', - type=paddle.data_type.integer_value_sequence(target_dict_dim)), - size=word_vector_dim, - param_attr=paddle.attr.ParamAttr(name='_target_language_embedding')) - group_inputs.append(trg_embedding) - - # For decoder equipped with attention mechanism, in training, - # target embeding (the groudtruth) is the data input, - # while encoded source sequence is accessed to as an unbounded memory. - # Here, the StaticInput defines a read-only memory - # for the recurrent_group. - decoder = paddle.layer.recurrent_group( - name=decoder_group_name, - step=gru_decoder_with_attention, - input=group_inputs) - - lbl = paddle.layer.data( - name='target_language_next_word', - type=paddle.data_type.integer_value_sequence(target_dict_dim)) - cost = paddle.layer.classification_cost(input=decoder, label=lbl) - - return cost diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py index 9904848b5d3ef95dc331fc0ba1a98f29f8b1dfeb..f5a16d51477f9cfbf0cd32af54098406fbbd2b41 100644 --- a/python/paddle/v2/dataset/wmt14.py +++ b/python/paddle/v2/dataset/wmt14.py @@ -14,129 +14,92 @@ """ wmt14 dataset """ -import paddle.v2.dataset.common import tarfile -import os.path -import itertools + +import paddle.v2.dataset.common __all__ = ['train', 'test', 'build_dict'] URL_DEV_TEST = 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz' MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5' -URL_TRAIN = 'http://localhost:8000/train.tgz' -MD5_TRAIN = '72de99da2830ea5a3a2c4eb36092bbc7' - - -def word_count(f, word_freq=None): - add = paddle.v2.dataset.common.dict_add - if word_freq == None: - word_freq = {} - - for l in f: - for w in l.strip().split(): - add(word_freq, w) - add(word_freq, '') - add(word_freq, '') - - return word_freq - - -def get_word_dix(word_freq): - TYPO_FREQ = 50 - word_freq = filter(lambda x: x[1] > TYPO_FREQ, word_freq.items()) - word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0])) - words, _ = list(zip(*word_freq_sorted)) - word_idx = dict(zip(words, xrange(len(words)))) - word_idx[''] = len(words) - return word_idx - - -def get_word_freq(train, dev): - word_freq = word_count(train, word_count(dev)) - if '' in word_freq: - # remove for now, since we will set it as last index - del word_freq[''] - return word_freq - - -def build_dict(): - base_dir = './wmt14-data' - train_en_filename = base_dir + '/train/train.en' - train_fr_filename = base_dir + '/train/train.fr' - dev_en_filename = base_dir + '/dev/ntst1213.en' - dev_fr_filename = base_dir + '/dev/ntst1213.fr' - - if not os.path.exists(train_en_filename) or not os.path.exists( - train_fr_filename): - with tarfile.open( - paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', - MD5_TRAIN)) as tf: - tf.extractall(base_dir) - - if not os.path.exists(dev_en_filename) or not os.path.exists( - dev_fr_filename): - with tarfile.open( - paddle.v2.dataset.common.download(URL_DEV_TEST, 'wmt14', - MD5_DEV_TEST)) as tf: - tf.extractall(base_dir) - - f_en = open(train_en_filename) - f_fr = open(train_fr_filename) - f_en_dev = open(dev_en_filename) - f_fr_dev = open(dev_fr_filename) - - word_freq_en = get_word_freq(f_en, f_en_dev) - word_freq_fr = get_word_freq(f_fr, f_fr_dev) - - f_en.close() - f_fr.close() - f_en_dev.close() - f_fr_dev.close() - - return get_word_dix(word_freq_en), get_word_dix(word_freq_fr) - - -def reader_creator(directory, path_en, path_fr, URL, MD5, dict_en, dict_fr): +# this is a small set of data for test. The original data is too large and will be add later. +URL_TRAIN = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz' +MD5_TRAIN = 'a755315dd01c2c35bde29a744ede23a6' + +START = "" +END = "" +UNK = "" +UNK_IDX = 2 + + +def __read_to_dict__(tar_file, dict_size): + def __to_dict__(fd, size): + out_dict = dict() + for line_count, line in enumerate(fd): + if line_count < size: + out_dict[line.strip()] = line_count + else: + break + return out_dict + + with tarfile.open(tar_file, mode='r') as f: + names = [ + each_item.name for each_item in f + if each_item.name.endswith("src.dict") + ] + assert len(names) == 1 + src_dict = __to_dict__(f.extractfile(names[0]), dict_size) + names = [ + each_item.name for each_item in f + if each_item.name.endswith("trg.dict") + ] + assert len(names) == 1 + trg_dict = __to_dict__(f.extractfile(names[0]), dict_size) + return src_dict, trg_dict + + +def reader_creator(tar_file, file_name, dict_size): def reader(): - if not os.path.exists(path_en) or not os.path.exists(path_fr): - with tarfile.open( - paddle.v2.dataset.common.download(URL, 'wmt14', MD5)) as tf: - tf.extractall(directory) - - f_en = open(path_en) - f_fr = open(path_fr) - UNK_en = dict_en[''] - UNK_fr = dict_fr[''] - - for en, fr in itertools.izip(f_en, f_fr): - src_ids = [dict_en.get(w, UNK_en) for w in en.strip().split()] - tar_ids = [ - dict_fr.get(w, UNK_fr) - for w in [''] + fr.strip().split() + [''] + src_dict, trg_dict = __read_to_dict__(tar_file, dict_size) + with tarfile.open(tar_file, mode='r') as f: + names = [ + each_item.name for each_item in f + if each_item.name.endswith(file_name) ] - - # remove sequence whose length > 80 in training mode - if len(src_ids) == 0 or len(tar_ids) <= 1 or len( - src_ids) > 80 or len(tar_ids) > 80: - continue - - yield src_ids, tar_ids[:-1], tar_ids[1:] - - f_en.close() - f_fr.close() + for name in names: + for line in f.extractfile(name): + line_split = line.strip().split('\t') + if len(line_split) != 2: + continue + src_seq = line_split[0] # one source sequence + src_words = src_seq.split() + src_ids = [ + src_dict.get(w, UNK_IDX) + for w in [START] + src_words + [END] + ] + + trg_seq = line_split[1] # one target sequence + trg_words = trg_seq.split() + trg_ids = [trg_dict.get(w, UNK_IDX) for w in trg_words] + + # remove sequence whose length > 80 in training mode + if len(src_ids) > 80 or len(trg_ids) > 80: + continue + trg_ids_next = trg_ids + [trg_dict[END]] + trg_ids = [trg_dict[START]] + trg_ids + + yield src_ids, trg_ids, trg_ids_next return reader -def train(dict_en, dict_fr): - directory = './wmt14-data' - return reader_creator(directory, directory + '/train/train.en', - directory + '/train/train.fr', URL_TRAIN, MD5_TRAIN, - dict_en, dict_fr) +def train(dict_size): + return reader_creator( + paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), + 'train/train', dict_size) -def test(dict_en, dict_fr): - directory = './wmt14-data' - return reader_creator(directory, directory + '/dev/ntst1213.en', - directory + '/dev/ntst1213.fr', URL_DEV_TEST, - MD5_DEV_TEST, dict_en, dict_fr) +def test(dict_size): + return reader_creator( + paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), + 'test/test', dict_size)