Commit 98522dcb authored by Q qiaolongfei

optimizer wmt14 dataset

Parent a4bd4147
Removed:

import os
import paddle.v2 as paddle
from seqToseq_net_v2 import seqToseq_net_v2

# Data Definition.
# TODO: This code should be merged into the dataset package.
data_dir = "./data/pre-wmt14"
src_lang_dict = os.path.join(data_dir, 'src.dict')
trg_lang_dict = os.path.join(data_dir, 'trg.dict')
source_dict_dim = len(open(src_lang_dict, "r").readlines())
target_dict_dim = len(open(trg_lang_dict, "r").readlines())


def read_to_dict(dict_path):
    with open(dict_path, "r") as fin:
        out_dict = {
            line.strip(): line_count
            for line_count, line in enumerate(fin)
        }
    return out_dict


src_dict = read_to_dict(src_lang_dict)
trg_dict = read_to_dict(trg_lang_dict)

train_list = os.path.join(data_dir, 'train.list')
test_list = os.path.join(data_dir, 'test.list')

UNK_IDX = 2
START = "<s>"
END = "<e>"


def _get_ids(s, dictionary):
    words = s.strip().split()
    return [dictionary[START]] + \
           [dictionary.get(w, UNK_IDX) for w in words] + \
           [dictionary[END]]


def train_reader(file_name):
    def reader():
        with open(file_name, 'r') as f:
            for line_count, line in enumerate(f):
                line_split = line.strip().split('\t')
                if len(line_split) != 2:
                    continue
                src_seq = line_split[0]  # one source sequence
                src_ids = _get_ids(src_seq, src_dict)

                trg_seq = line_split[1]  # one target sequence
                trg_words = trg_seq.split()
                trg_ids = [trg_dict.get(w, UNK_IDX) for w in trg_words]

                # remove sequence whose length > 80 in training mode
                if len(src_ids) > 80 or len(trg_ids) > 80:
                    continue
                trg_ids_next = trg_ids + [trg_dict[END]]
                trg_ids = [trg_dict[START]] + trg_ids

                yield src_ids, trg_ids, trg_ids_next

    return reader


Added:

import paddle.v2 as paddle


def seqToseq_net(source_dict_dim, target_dict_dim):
    ### Network Architecture
    word_vector_dim = 512  # dimension of word vector
    decoder_size = 512  # dimension of hidden unit in GRU Decoder network
    encoder_size = 512  # dimension of hidden unit in GRU Encoder network

    #### Encoder
    src_word_id = paddle.layer.data(
        name='source_language_word',
        type=paddle.data_type.integer_value_sequence(source_dict_dim))
    src_embedding = paddle.layer.embedding(
        input=src_word_id,
        size=word_vector_dim,
        param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
    src_forward = paddle.networks.simple_gru(
        input=src_embedding, size=encoder_size)
    src_backward = paddle.networks.simple_gru(
        input=src_embedding, size=encoder_size, reverse=True)
    encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])

    #### Decoder
    with paddle.layer.mixed(size=decoder_size) as encoded_proj:
        encoded_proj += paddle.layer.full_matrix_projection(
            input=encoded_vector)

    backward_first = paddle.layer.first_seq(input=src_backward)

    with paddle.layer.mixed(
            size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot:
        decoder_boot += paddle.layer.full_matrix_projection(
            input=backward_first)
    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
        decoder_mem = paddle.layer.memory(
            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)

        context = paddle.networks.simple_attention(
            encoded_sequence=enc_vec,
            encoded_proj=enc_proj,
            decoder_state=decoder_mem)

        with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
            decoder_inputs += paddle.layer.full_matrix_projection(input=context)
            decoder_inputs += paddle.layer.full_matrix_projection(
                input=current_word)

        gru_step = paddle.layer.gru_step(
            name='gru_decoder',
            input=decoder_inputs,
            output_mem=decoder_mem,
            size=decoder_size)

        with paddle.layer.mixed(
                size=target_dict_dim,
                bias_attr=True,
                act=paddle.activation.Softmax()) as out:
            out += paddle.layer.full_matrix_projection(input=gru_step)
        return out

    decoder_group_name = "decoder_group"
    group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
    group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True)
    group_inputs = [group_input1, group_input2]

    trg_embedding = paddle.layer.embedding(
        input=paddle.layer.data(
            name='target_language_word',
            type=paddle.data_type.integer_value_sequence(target_dict_dim)),
        size=word_vector_dim,
        param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
    group_inputs.append(trg_embedding)
    # For a decoder equipped with an attention mechanism, in training,
    # the target embedding (the ground truth) is the data input,
    # while the encoded source sequence is accessed as an unbounded memory.
    # Here, the StaticInput defines a read-only memory
    # for the recurrent_group.
    decoder = paddle.layer.recurrent_group(
        name=decoder_group_name,
        step=gru_decoder_with_attention,
        input=group_inputs)

    lbl = paddle.layer.data(
        name='target_language_next_word',
        type=paddle.data_type.integer_value_sequence(target_dict_dim))

    cost = paddle.layer.classification_cost(input=decoder, label=lbl)
    return cost
def main():
    paddle.init(use_gpu=False, trainer_count=1)

    # source and target dict dim.
    dict_size = 30000
    source_dict_dim = target_dict_dim = dict_size

    # define network topology
    cost = seqToseq_net(source_dict_dim, target_dict_dim)
    # was: cost = seqToseq_net_v2(source_dict_dim, target_dict_dim)
    parameters = paddle.parameters.create(cost)

    # define optimize method and trainer

...@@ -85,10 +115,9 @@ def main():

        'target_language_word': 1,
        'target_language_next_word': 2
    }

    wmt14_reader = paddle.reader.batched(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.train(dict_size=dict_size), buf_size=8192),
        # was: train_reader("data/pre-wmt14/train/train"), buf_size=8192),
        batch_size=5)

    # define event_handler callback
...
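Purely for orientation (not part of this commit), the sketch below spells out how one sample yielded by the wmt14 reader lines up with the feeding dict used above. The concrete id values and the 'source_language_word': 0 entry are assumptions made for illustration; only the three-tuple layout follows the reader code.

# Illustrative sketch only (not from the commit): the reader yields
# (src_ids, trg_ids, trg_ids_next), and the feeding dict maps tuple
# positions to data layers. Ids below are made up.
feeding = {
    'source_language_word': 0,      # assumed index for the source slot
    'target_language_word': 1,
    'target_language_next_word': 2
}
sample = (
    [0, 647, 4, 1],   # src_ids: <s> ... <e>        -> 'source_language_word'
    [0, 208, 7],      # trg_ids: <s> prepended       -> 'target_language_word'
    [208, 7, 1],      # trg_ids_next: <e> appended   -> 'target_language_next_word'
)
src_ids, trg_ids, trg_ids_next = sample
assert len(trg_ids) == len(trg_ids_next)  # same length, shifted by one token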
import paddle.v2 as paddle


def seqToseq_net_v2(source_dict_dim, target_dict_dim):
    ### Network Architecture
    word_vector_dim = 512  # dimension of word vector
    decoder_size = 512  # dimension of hidden unit in GRU Decoder network
    encoder_size = 512  # dimension of hidden unit in GRU Encoder network

    #### Encoder
    src_word_id = paddle.layer.data(
        name='source_language_word',
        type=paddle.data_type.integer_value_sequence(source_dict_dim))
    src_embedding = paddle.layer.embedding(
        input=src_word_id,
        size=word_vector_dim,
        param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
    src_forward = paddle.networks.simple_gru(
        input=src_embedding, size=encoder_size)
    src_backward = paddle.networks.simple_gru(
        input=src_embedding, size=encoder_size, reverse=True)
    encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])

    #### Decoder
    with paddle.layer.mixed(size=decoder_size) as encoded_proj:
        encoded_proj += paddle.layer.full_matrix_projection(
            input=encoded_vector)

    backward_first = paddle.layer.first_seq(input=src_backward)

    with paddle.layer.mixed(
            size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot:
        decoder_boot += paddle.layer.full_matrix_projection(
            input=backward_first)

    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
        decoder_mem = paddle.layer.memory(
            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)

        context = paddle.networks.simple_attention(
            encoded_sequence=enc_vec,
            encoded_proj=enc_proj,
            decoder_state=decoder_mem)

        with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
            decoder_inputs += paddle.layer.full_matrix_projection(input=context)
            decoder_inputs += paddle.layer.full_matrix_projection(
                input=current_word)

        gru_step = paddle.layer.gru_step(
            name='gru_decoder',
            input=decoder_inputs,
            output_mem=decoder_mem,
            size=decoder_size)

        with paddle.layer.mixed(
                size=target_dict_dim,
                bias_attr=True,
                act=paddle.activation.Softmax()) as out:
            out += paddle.layer.full_matrix_projection(input=gru_step)
        return out

    decoder_group_name = "decoder_group"
    group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
    group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True)
    group_inputs = [group_input1, group_input2]

    trg_embedding = paddle.layer.embedding(
        input=paddle.layer.data(
            name='target_language_word',
            type=paddle.data_type.integer_value_sequence(target_dict_dim)),
        size=word_vector_dim,
        param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
    group_inputs.append(trg_embedding)
    # For a decoder equipped with an attention mechanism, in training,
    # the target embedding (the ground truth) is the data input,
    # while the encoded source sequence is accessed as an unbounded memory.
    # Here, the StaticInput defines a read-only memory
    # for the recurrent_group.
    decoder = paddle.layer.recurrent_group(
        name=decoder_group_name,
        step=gru_decoder_with_attention,
        input=group_inputs)

    lbl = paddle.layer.data(
        name='target_language_next_word',
        type=paddle.data_type.integer_value_sequence(target_dict_dim))

    cost = paddle.layer.classification_cost(input=decoder, label=lbl)
    return cost
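As an aside, here is a rough numpy sketch of the additive attention that paddle.networks.simple_attention is assumed to compute inside gru_decoder_with_attention: the previous decoder state is projected, added to the pre-projected encoder states (encoded_proj), squashed with tanh and scored, and the softmax weights produce the context as a weighted sum of the encoder states. All weights and shapes below are invented for illustration; only the 1024-wide encoder output (two GRUs of size 512 concatenated) comes from the code above.

import numpy as np

# Rough sketch (an assumption, not the Paddle implementation) of additive
# attention. enc_seq stands in for encoded_vector, enc_proj for encoded_proj,
# dec_state for decoder_mem; W_dec and v are hypothetical learned parameters.
def attention_context(enc_seq, enc_proj, dec_state, W_dec, v):
    scores = np.tanh(enc_proj + dec_state.dot(W_dec)).dot(v)  # one score per source position
    align = np.exp(scores - scores.max())
    align /= align.sum()                                       # softmax attention weights
    return align.dot(enc_seq)                                  # context = weighted sum of encoder states

seq_len, enc_dim, dec_dim, proj_dim = 7, 1024, 512, 512        # enc_dim = 2 * encoder_size
rng = np.random.RandomState(0)
context = attention_context(
    rng.randn(seq_len, enc_dim), rng.randn(seq_len, proj_dim),
    rng.randn(dec_dim), rng.randn(dec_dim, proj_dim), rng.randn(proj_dim))
assert context.shape == (enc_dim,)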
...@@ -14,129 +14,102 @@

Removed:

"""
wmt14 dataset
"""
import paddle.v2.dataset.common
import tarfile
import os.path
import itertools

__all__ = ['train', 'test', 'build_dict']

URL_DEV_TEST = 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz'
MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5'
URL_TRAIN = 'http://localhost:8000/train.tgz'
MD5_TRAIN = '72de99da2830ea5a3a2c4eb36092bbc7'


def word_count(f, word_freq=None):
    add = paddle.v2.dataset.common.dict_add
    if word_freq == None:
        word_freq = {}

    for l in f:
        for w in l.strip().split():
            add(word_freq, w)
        add(word_freq, '<s>')
        add(word_freq, '<e>')

    return word_freq


def get_word_dix(word_freq):
    TYPO_FREQ = 50
    word_freq = filter(lambda x: x[1] > TYPO_FREQ, word_freq.items())
    word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*word_freq_sorted))
    word_idx = dict(zip(words, xrange(len(words))))
    word_idx['<unk>'] = len(words)
    return word_idx


def get_word_freq(train, dev):
    word_freq = word_count(train, word_count(dev))
    if '<unk>' in word_freq:
        # remove <unk> for now, since we will set it as last index
        del word_freq['<unk>']
    return word_freq


def build_dict():
    base_dir = './wmt14-data'
    train_en_filename = base_dir + '/train/train.en'
    train_fr_filename = base_dir + '/train/train.fr'
    dev_en_filename = base_dir + '/dev/ntst1213.en'
    dev_fr_filename = base_dir + '/dev/ntst1213.fr'

    if not os.path.exists(train_en_filename) or not os.path.exists(
            train_fr_filename):
        with tarfile.open(
                paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14',
                                                  MD5_TRAIN)) as tf:
            tf.extractall(base_dir)

    if not os.path.exists(dev_en_filename) or not os.path.exists(
            dev_fr_filename):
        with tarfile.open(
                paddle.v2.dataset.common.download(URL_DEV_TEST, 'wmt14',
                                                  MD5_DEV_TEST)) as tf:
            tf.extractall(base_dir)

    f_en = open(train_en_filename)
    f_fr = open(train_fr_filename)
    f_en_dev = open(dev_en_filename)
    f_fr_dev = open(dev_fr_filename)

    word_freq_en = get_word_freq(f_en, f_en_dev)
    word_freq_fr = get_word_freq(f_fr, f_fr_dev)

    f_en.close()
    f_fr.close()
    f_en_dev.close()
    f_fr_dev.close()

    return get_word_dix(word_freq_en), get_word_dix(word_freq_fr)


def reader_creator(directory, path_en, path_fr, URL, MD5, dict_en, dict_fr):
    def reader():
        if not os.path.exists(path_en) or not os.path.exists(path_fr):
            with tarfile.open(
                    paddle.v2.dataset.common.download(URL, 'wmt14', MD5)) as tf:
                tf.extractall(directory)

        f_en = open(path_en)
        f_fr = open(path_fr)
        UNK_en = dict_en['<unk>']
        UNK_fr = dict_fr['<unk>']

        for en, fr in itertools.izip(f_en, f_fr):
            src_ids = [dict_en.get(w, UNK_en) for w in en.strip().split()]
            tar_ids = [
                dict_fr.get(w, UNK_fr)
                for w in ['<s>'] + fr.strip().split() + ['<e>']
            ]

            # remove sequence whose length > 80 in training mode
            if len(src_ids) == 0 or len(tar_ids) <= 1 or len(
                    src_ids) > 80 or len(tar_ids) > 80:
                continue

            yield src_ids, tar_ids[:-1], tar_ids[1:]

        f_en.close()
        f_fr.close()

    return reader


def train(dict_en, dict_fr):
    directory = './wmt14-data'
    return reader_creator(directory, directory + '/train/train.en',
                          directory + '/train/train.fr', URL_TRAIN, MD5_TRAIN,
                          dict_en, dict_fr)


def test(dict_en, dict_fr):
    directory = './wmt14-data'
    return reader_creator(directory, directory + '/dev/ntst1213.en',
                          directory + '/dev/ntst1213.fr', URL_DEV_TEST,
                          MD5_DEV_TEST, dict_en, dict_fr)


Added:

"""
wmt14 dataset
"""
import os
import os.path
import tarfile

import paddle.v2.dataset.common
from wmt14_util import SeqToSeqDatasetCreater

__all__ = ['train', 'test', 'build_dict']

URL_DEV_TEST = 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz'
MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5'
URL_TRAIN = 'http://localhost:8989/wmt14.tgz'
MD5_TRAIN = '7373473f86016f1f48037c9c340a2d5b'

START = "<s>"
END = "<e>"
UNK = "<unk>"
UNK_IDX = 2

DEFAULT_DATA_DIR = "./data"
ORIGIN_DATA_DIR = "wmt14"
INNER_DATA_DIR = "pre-wmt14"
SRC_DICT = INNER_DATA_DIR + "/src.dict"
TRG_DICT = INNER_DATA_DIR + "/trg.dict"
TRAIN_FILE = INNER_DATA_DIR + "/train/train"


def __process_data__(data_path, dict_size=None):
    downloaded_data = os.path.join(data_path, ORIGIN_DATA_DIR)
    if not os.path.exists(downloaded_data):
        # 1. download and extract tgz.
        with tarfile.open(
                paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14',
                                                  MD5_TRAIN)) as tf:
            tf.extractall(data_path)

    # 2. process data file to intermediate format.
    processed_data = os.path.join(data_path, INNER_DATA_DIR)
    if not os.path.exists(processed_data):
        dict_size = dict_size or -1
        data_creator = SeqToSeqDatasetCreater(downloaded_data, processed_data)
        data_creator.create_dataset(dict_size, mergeDict=False)


def __read_to_dict__(dict_path, count):
    with open(dict_path, "r") as fin:
        out_dict = dict()
        for line_count, line in enumerate(fin):
            if line_count <= count:
                out_dict[line.strip()] = line_count
            else:
                break
    return out_dict


def __reader__(file_name, src_dict, trg_dict):
    with open(file_name, 'r') as f:
        for line_count, line in enumerate(f):
            line_split = line.strip().split('\t')
            if len(line_split) != 2:
                continue
            src_seq = line_split[0]  # one source sequence
            src_words = src_seq.split()
            src_ids = [
                src_dict.get(w, UNK_IDX) for w in [START] + src_words + [END]
            ]

            trg_seq = line_split[1]  # one target sequence
            trg_words = trg_seq.split()
            trg_ids = [trg_dict.get(w, UNK_IDX) for w in trg_words]

            # remove sequence whose length > 80 in training mode
            if len(src_ids) > 80 or len(trg_ids) > 80:
                continue
            trg_ids_next = trg_ids + [trg_dict[END]]
            trg_ids = [trg_dict[START]] + trg_ids

            yield src_ids, trg_ids, trg_ids_next


def train(data_dir=None, dict_size=None):
    data_dir = data_dir or DEFAULT_DATA_DIR
    __process_data__(data_dir, dict_size)

    src_lang_dict = os.path.join(data_dir, SRC_DICT)
    trg_lang_dict = os.path.join(data_dir, TRG_DICT)
    train_file_name = os.path.join(data_dir, TRAIN_FILE)

    default_dict_size = len(open(src_lang_dict, "r").readlines())

    if dict_size > default_dict_size:
        raise ValueError("dict_dim should not be larger than the "
                         "length of word dict")

    real_dict_dim = dict_size or default_dict_size

    src_dict = __read_to_dict__(src_lang_dict, real_dict_dim)
    trg_dict = __read_to_dict__(trg_lang_dict, real_dict_dim)

    return lambda: __reader__(train_file_name, src_dict, trg_dict)
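To close the loop, a minimal usage sketch of the reworked module (not part of the diff, and assuming the archive behind URL_TRAIN is reachable so the data can be downloaded and preprocessed into ./data/pre-wmt14): train() now returns a reader callable, so it plugs directly into paddle.reader.shuffle and paddle.reader.batched as in the training script above.

# Minimal usage sketch of the new API (assumes the wmt14 package can be
# downloaded and processed as __process_data__ expects).
import paddle.v2 as paddle

reader = paddle.dataset.wmt14.train(dict_size=30000)   # a callable that returns a generator
for i, sample in enumerate(reader()):
    src_ids, trg_ids, trg_ids_next = sample             # three integer id sequences per sample
    print(len(src_ids), len(trg_ids), len(trg_ids_next))
    if i == 2:                                           # peek at the first few samples only
        break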