From 98522dcb3390ecbccc4c544e0c9c57b8d3e4b2c9 Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Sun, 5 Mar 2017 21:26:44 +0800
Subject: [PATCH] optimizer wmt14 dataset

---
 demo/seqToseq/api_train_v2.py     | 161 +++++++++++++++-----------
 demo/seqToseq/seqToseq_net_v2.py  |  92 ---------------
 python/paddle/v2/dataset/wmt14.py | 185 +++++++++++++-----------------
 3 files changed, 174 insertions(+), 264 deletions(-)
 delete mode 100644 demo/seqToseq/seqToseq_net_v2.py

diff --git a/demo/seqToseq/api_train_v2.py b/demo/seqToseq/api_train_v2.py
index a5f59ec3797..f100ef80cb1 100644
--- a/demo/seqToseq/api_train_v2.py
+++ b/demo/seqToseq/api_train_v2.py
@@ -1,76 +1,106 @@
-import os
-
 import paddle.v2 as paddle
-from seqToseq_net_v2 import seqToseq_net_v2
-
-# Data Definiation.
-# TODO:This code should be merged to dataset package.
-data_dir = "./data/pre-wmt14"
-src_lang_dict = os.path.join(data_dir, 'src.dict')
-trg_lang_dict = os.path.join(data_dir, 'trg.dict')
-
-source_dict_dim = len(open(src_lang_dict, "r").readlines())
-target_dict_dim = len(open(trg_lang_dict, "r").readlines())
-
-
-def read_to_dict(dict_path):
-    with open(dict_path, "r") as fin:
-        out_dict = {
-            line.strip(): line_count
-            for line_count, line in enumerate(fin)
-        }
-    return out_dict
-
-
-src_dict = read_to_dict(src_lang_dict)
-trg_dict = read_to_dict(trg_lang_dict)
-
-train_list = os.path.join(data_dir, 'train.list')
-test_list = os.path.join(data_dir, 'test.list')
-
-UNK_IDX = 2
-START = "<s>"
-END = "<e>"
-
-
-def _get_ids(s, dictionary):
-    words = s.strip().split()
-    return [dictionary[START]] + \
-           [dictionary.get(w, UNK_IDX) for w in words] + \
-           [dictionary[END]]
-
-
-def train_reader(file_name):
-    def reader():
-        with open(file_name, 'r') as f:
-            for line_count, line in enumerate(f):
-                line_split = line.strip().split('\t')
-                if len(line_split) != 2:
-                    continue
-                src_seq = line_split[0]  # one source sequence
-                src_ids = _get_ids(src_seq, src_dict)
-
-                trg_seq = line_split[1]  # one target sequence
-                trg_words = trg_seq.split()
-                trg_ids = [trg_dict.get(w, UNK_IDX) for w in trg_words]
-
-                # remove sequence whose length > 80 in training mode
-                if len(src_ids) > 80 or len(trg_ids) > 80:
-                    continue
-                trg_ids_next = trg_ids + [trg_dict[END]]
-                trg_ids = [trg_dict[START]] + trg_ids
-
-                yield src_ids, trg_ids, trg_ids_next
-
-    return reader
+def seqToseq_net(source_dict_dim, target_dict_dim):
+    ### Network Architecture
+    word_vector_dim = 512  # dimension of word vector
+    decoder_size = 512  # dimension of hidden unit in GRU Decoder network
+    encoder_size = 512  # dimension of hidden unit in GRU Encoder network
+
+    #### Encoder
+    src_word_id = paddle.layer.data(
+        name='source_language_word',
+        type=paddle.data_type.integer_value_sequence(source_dict_dim))
+    src_embedding = paddle.layer.embedding(
+        input=src_word_id,
+        size=word_vector_dim,
+        param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
+    src_forward = paddle.networks.simple_gru(
+        input=src_embedding, size=encoder_size)
+    src_backward = paddle.networks.simple_gru(
+        input=src_embedding, size=encoder_size, reverse=True)
+    encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
+
+    #### Decoder
+    with paddle.layer.mixed(size=decoder_size) as encoded_proj:
+        encoded_proj += paddle.layer.full_matrix_projection(
+            input=encoded_vector)
+
+    backward_first = paddle.layer.first_seq(input=src_backward)
+
+    with paddle.layer.mixed(
+            size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot:
+        decoder_boot += paddle.layer.full_matrix_projection(
+            input=backward_first)
+
+    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
+
+        decoder_mem = paddle.layer.memory(
+            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
+
+        context = paddle.networks.simple_attention(
+            encoded_sequence=enc_vec,
+            encoded_proj=enc_proj,
+            decoder_state=decoder_mem)
+
+        with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
+            decoder_inputs += paddle.layer.full_matrix_projection(input=context)
+            decoder_inputs += paddle.layer.full_matrix_projection(
+                input=current_word)
+
+        gru_step = paddle.layer.gru_step(
+            name='gru_decoder',
+            input=decoder_inputs,
+            output_mem=decoder_mem,
+            size=decoder_size)
+
+        with paddle.layer.mixed(
+                size=target_dict_dim,
+                bias_attr=True,
+                act=paddle.activation.Softmax()) as out:
+            out += paddle.layer.full_matrix_projection(input=gru_step)
+        return out
+
+    decoder_group_name = "decoder_group"
+    group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
+    group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True)
+    group_inputs = [group_input1, group_input2]
+
+    trg_embedding = paddle.layer.embedding(
+        input=paddle.layer.data(
+            name='target_language_word',
+            type=paddle.data_type.integer_value_sequence(target_dict_dim)),
+        size=word_vector_dim,
+        param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
+    group_inputs.append(trg_embedding)
+
+    # For a decoder equipped with an attention mechanism, in training,
+    # the target embedding (the ground truth) is the data input,
+    # while the encoded source sequence is accessed as an unbounded memory.
+    # Here, the StaticInput defines a read-only memory
+    # for the recurrent_group.
+    decoder = paddle.layer.recurrent_group(
+        name=decoder_group_name,
+        step=gru_decoder_with_attention,
+        input=group_inputs)
+
+    lbl = paddle.layer.data(
+        name='target_language_next_word',
+        type=paddle.data_type.integer_value_sequence(target_dict_dim))
+    cost = paddle.layer.classification_cost(input=decoder, label=lbl)
+
+    return cost


 def main():
     paddle.init(use_gpu=False, trainer_count=1)
+    # source and target dict dim.
+    dict_size = 30000
+    source_dict_dim = target_dict_dim = dict_size
+
     # define network topology
-    cost = seqToseq_net_v2(source_dict_dim, target_dict_dim)
+    cost = seqToseq_net(source_dict_dim, target_dict_dim)
     parameters = paddle.parameters.create(cost)

     # define optimize method and trainer
@@ -85,10 +115,9 @@ def main():
         'target_language_word': 1,
         'target_language_next_word': 2
     }
-
     wmt14_reader = paddle.reader.batched(
         paddle.reader.shuffle(
-            train_reader("data/pre-wmt14/train/train"), buf_size=8192),
+            paddle.dataset.wmt14.train(dict_size=dict_size), buf_size=8192),
         batch_size=5)

     # define event_handler callback
diff --git a/demo/seqToseq/seqToseq_net_v2.py b/demo/seqToseq/seqToseq_net_v2.py
deleted file mode 100644
index 058a6789d70..00000000000
--- a/demo/seqToseq/seqToseq_net_v2.py
+++ /dev/null
@@ -1,92 +0,0 @@
-import paddle.v2 as paddle
-
-
-def seqToseq_net_v2(source_dict_dim, target_dict_dim):
-    ### Network Architecture
-    word_vector_dim = 512  # dimension of word vector
-    decoder_size = 512  # dimension of hidden unit in GRU Decoder network
-    encoder_size = 512  # dimension of hidden unit in GRU Encoder network
-
-    #### Encoder
-    src_word_id = paddle.layer.data(
-        name='source_language_word',
-        type=paddle.data_type.integer_value_sequence(source_dict_dim))
-    src_embedding = paddle.layer.embedding(
-        input=src_word_id,
-        size=word_vector_dim,
-        param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
-    src_forward = paddle.networks.simple_gru(
-        input=src_embedding, size=encoder_size)
-    src_backward = paddle.networks.simple_gru(
-        input=src_embedding, size=encoder_size, reverse=True)
-    encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
-
-    #### Decoder
-    with paddle.layer.mixed(size=decoder_size) as encoded_proj:
-        encoded_proj += paddle.layer.full_matrix_projection(
-            input=encoded_vector)
-
-    backward_first = paddle.layer.first_seq(input=src_backward)
-
-    with paddle.layer.mixed(
-            size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot:
-        decoder_boot += paddle.layer.full_matrix_projection(
-            input=backward_first)
-
-    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
-
-        decoder_mem = paddle.layer.memory(
-            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
-
-        context = paddle.networks.simple_attention(
-            encoded_sequence=enc_vec,
-            encoded_proj=enc_proj,
-            decoder_state=decoder_mem)
-
-        with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
-            decoder_inputs += paddle.layer.full_matrix_projection(input=context)
-            decoder_inputs += paddle.layer.full_matrix_projection(
-                input=current_word)
-
-        gru_step = paddle.layer.gru_step(
-            name='gru_decoder',
-            input=decoder_inputs,
-            output_mem=decoder_mem,
-            size=decoder_size)
-
-        with paddle.layer.mixed(
-                size=target_dict_dim,
-                bias_attr=True,
-                act=paddle.activation.Softmax()) as out:
-            out += paddle.layer.full_matrix_projection(input=gru_step)
-        return out
-
-    decoder_group_name = "decoder_group"
-    group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
-    group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True)
-    group_inputs = [group_input1, group_input2]
-
-    trg_embedding = paddle.layer.embedding(
-        input=paddle.layer.data(
-            name='target_language_word',
-            type=paddle.data_type.integer_value_sequence(target_dict_dim)),
-        size=word_vector_dim,
-        param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
-    group_inputs.append(trg_embedding)
-
-    # For decoder equipped with attention mechanism, in training,
-    # target embeding (the groudtruth) is the data input,
-    # while encoded source sequence is accessed to as an unbounded memory.
-    # Here, the StaticInput defines a read-only memory
-    # for the recurrent_group.
-    decoder = paddle.layer.recurrent_group(
-        name=decoder_group_name,
-        step=gru_decoder_with_attention,
-        input=group_inputs)
-
-    lbl = paddle.layer.data(
-        name='target_language_next_word',
-        type=paddle.data_type.integer_value_sequence(target_dict_dim))
-    cost = paddle.layer.classification_cost(input=decoder, label=lbl)
-
-    return cost
diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py
index 9904848b5d3..5a9dd4ca809 100644
--- a/python/paddle/v2/dataset/wmt14.py
+++ b/python/paddle/v2/dataset/wmt14.py
@@ -14,129 +14,102 @@
 """
 wmt14 dataset
 """
-import paddle.v2.dataset.common
-import tarfile
+import os
 import os.path
-import itertools
+import tarfile
+
+import paddle.v2.dataset.common
+from wmt14_util import SeqToSeqDatasetCreater

 __all__ = ['train', 'test', 'build_dict']

 URL_DEV_TEST = 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz'
 MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5'
-URL_TRAIN = 'http://localhost:8000/train.tgz'
-MD5_TRAIN = '72de99da2830ea5a3a2c4eb36092bbc7'
-
-
-def word_count(f, word_freq=None):
-    add = paddle.v2.dataset.common.dict_add
-    if word_freq == None:
-        word_freq = {}
-
-    for l in f:
-        for w in l.strip().split():
-            add(word_freq, w)
-        add(word_freq, '<s>')
-        add(word_freq, '<e>')
-
-    return word_freq
-
-
-def get_word_dix(word_freq):
-    TYPO_FREQ = 50
-    word_freq = filter(lambda x: x[1] > TYPO_FREQ, word_freq.items())
-    word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
-    words, _ = list(zip(*word_freq_sorted))
-    word_idx = dict(zip(words, xrange(len(words))))
-    word_idx['<unk>'] = len(words)
-    return word_idx
-
-
-def get_word_freq(train, dev):
-    word_freq = word_count(train, word_count(dev))
-    if '<unk>' in word_freq:
-        # remove <unk> for now, since we will set it as last index
-        del word_freq['<unk>']
-    return word_freq
-
-
-def build_dict():
-    base_dir = './wmt14-data'
-    train_en_filename = base_dir + '/train/train.en'
-    train_fr_filename = base_dir + '/train/train.fr'
-    dev_en_filename = base_dir + '/dev/ntst1213.en'
-    dev_fr_filename = base_dir + '/dev/ntst1213.fr'
-
-    if not os.path.exists(train_en_filename) or not os.path.exists(
-            train_fr_filename):
+URL_TRAIN = 'http://localhost:8989/wmt14.tgz'
+MD5_TRAIN = '7373473f86016f1f48037c9c340a2d5b'
+
+START = "<s>"
+END = "<e>"
+UNK = "<unk>"
+UNK_IDX = 2
+
+DEFAULT_DATA_DIR = "./data"
+ORIGIN_DATA_DIR = "wmt14"
+INNER_DATA_DIR = "pre-wmt14"
+SRC_DICT = INNER_DATA_DIR + "/src.dict"
+TRG_DICT = INNER_DATA_DIR + "/trg.dict"
+TRAIN_FILE = INNER_DATA_DIR + "/train/train"
+
+
+def __process_data__(data_path, dict_size=None):
+    downloaded_data = os.path.join(data_path, ORIGIN_DATA_DIR)
+    if not os.path.exists(downloaded_data):
+        # 1. download and extract tgz.
         with tarfile.open(
                 paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14',
                                                   MD5_TRAIN)) as tf:
-            tf.extractall(base_dir)
-
-    if not os.path.exists(dev_en_filename) or not os.path.exists(
-            dev_fr_filename):
-        with tarfile.open(
-                paddle.v2.dataset.common.download(URL_DEV_TEST, 'wmt14',
-                                                  MD5_DEV_TEST)) as tf:
-            tf.extractall(base_dir)
-
-    f_en = open(train_en_filename)
-    f_fr = open(train_fr_filename)
-    f_en_dev = open(dev_en_filename)
-    f_fr_dev = open(dev_fr_filename)
-
-    word_freq_en = get_word_freq(f_en, f_en_dev)
-    word_freq_fr = get_word_freq(f_fr, f_fr_dev)
-
-    f_en.close()
-    f_fr.close()
-    f_en_dev.close()
-    f_fr_dev.close()
-
-    return get_word_dix(word_freq_en), get_word_dix(word_freq_fr)
-
-
-def reader_creator(directory, path_en, path_fr, URL, MD5, dict_en, dict_fr):
-    def reader():
-        if not os.path.exists(path_en) or not os.path.exists(path_fr):
-            with tarfile.open(
-                    paddle.v2.dataset.common.download(URL, 'wmt14', MD5)) as tf:
-                tf.extractall(directory)
-
-        f_en = open(path_en)
-        f_fr = open(path_fr)
-        UNK_en = dict_en['<unk>']
-        UNK_fr = dict_fr['<unk>']
-
-        for en, fr in itertools.izip(f_en, f_fr):
-            src_ids = [dict_en.get(w, UNK_en) for w in en.strip().split()]
-            tar_ids = [
-                dict_fr.get(w, UNK_fr)
-                for w in ['<s>'] + fr.strip().split() + ['<e>']
+            tf.extractall(data_path)
+
+    # 2. process data file to intermediate format.
+    processed_data = os.path.join(data_path, INNER_DATA_DIR)
+    if not os.path.exists(processed_data):
+        dict_size = dict_size or -1
+        data_creator = SeqToSeqDatasetCreater(downloaded_data, processed_data)
+        data_creator.create_dataset(dict_size, mergeDict=False)
+
+
+def __read_to_dict__(dict_path, count):
+    with open(dict_path, "r") as fin:
+        out_dict = dict()
+        for line_count, line in enumerate(fin):
+            if line_count <= count:
+                out_dict[line.strip()] = line_count
+            else:
+                break
+    return out_dict
+
+
+def __reader__(file_name, src_dict, trg_dict):
+    with open(file_name, 'r') as f:
+        for line_count, line in enumerate(f):
+            line_split = line.strip().split('\t')
+            if len(line_split) != 2:
+                continue
+            src_seq = line_split[0]  # one source sequence
+            src_words = src_seq.split()
+            src_ids = [
+                src_dict.get(w, UNK_IDX) for w in [START] + src_words + [END]
             ]
+            trg_seq = line_split[1]  # one target sequence
+            trg_words = trg_seq.split()
+            trg_ids = [trg_dict.get(w, UNK_IDX) for w in trg_words]
+
             # remove sequence whose length > 80 in training mode
-            if len(src_ids) == 0 or len(tar_ids) <= 1 or len(
-                    src_ids) > 80 or len(tar_ids) > 80:
+            if len(src_ids) > 80 or len(trg_ids) > 80:
                 continue
+            trg_ids_next = trg_ids + [trg_dict[END]]
+            trg_ids = [trg_dict[START]] + trg_ids
+
+            yield src_ids, trg_ids, trg_ids_next

-            yield src_ids, tar_ids[:-1], tar_ids[1:]

-        f_en.close()
-        f_fr.close()
+def train(data_dir=None, dict_size=None):
+    data_dir = data_dir or DEFAULT_DATA_DIR
+    __process_data__(data_dir, dict_size)
+    src_lang_dict = os.path.join(data_dir, SRC_DICT)
+    trg_lang_dict = os.path.join(data_dir, TRG_DICT)
+    train_file_name = os.path.join(data_dir, TRAIN_FILE)

-    return reader
+    default_dict_size = len(open(src_lang_dict, "r").readlines())

+    if dict_size > default_dict_size:
+        raise ValueError("dict_dim should not be larger than the "
+                         "length of word dict")

-def train(dict_en, dict_fr):
-    directory = './wmt14-data'
-    return reader_creator(directory, directory + '/train/train.en',
-                          directory + '/train/train.fr', URL_TRAIN, MD5_TRAIN,
-                          dict_en, dict_fr)
+    real_dict_dim = dict_size or default_dict_size
+    src_dict = __read_to_dict__(src_lang_dict, real_dict_dim)
+    trg_dict = __read_to_dict__(trg_lang_dict, real_dict_dim)

-def test(dict_en, dict_fr):
-    directory = './wmt14-data'
-    return reader_creator(directory, directory + '/dev/ntst1213.en',
-                          directory + '/dev/ntst1213.fr', URL_DEV_TEST,
-                          MD5_DEV_TEST, dict_en, dict_fr)
+    return lambda: __reader__(train_file_name, src_dict, trg_dict)
--
GitLab
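
Note for reviewers (not part of the patch): a minimal usage sketch of the reworked dataset entry point, mirroring the reader and feeding setup that main() in api_train_v2.py uses after this change. The tuple layout described in the comments is read off __reader__ above; the dict_size, buf_size and batch_size values are simply the ones the patch itself uses, and the 'source_language_word': 0 entry is the implied first key of the feeding dict whose tail appears in the api_train_v2.py hunk.

    import paddle.v2 as paddle

    dict_size = 30000

    # paddle.dataset.wmt14.train() now returns a reader creator: a
    # zero-argument callable that yields one (src_ids, trg_ids, trg_ids_next)
    # tuple per sentence pair, downloading and preprocessing the data on
    # first use.
    wmt14_reader = paddle.reader.batched(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.train(dict_size=dict_size), buf_size=8192),
        batch_size=5)

    # The feeding dict maps each data layer name to its position in that tuple.
    feeding = {
        'source_language_word': 0,  # src_ids: <s> + source words + <e>
        'target_language_word': 1,  # trg_ids: <s> + target words
        'target_language_next_word': 2  # trg_ids_next: target words + <e>
    }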