提交 5905d0e8 编写于 作者: Y Yu Yang

Merge branch 'develop' of github.com:baidu/Paddle into feature/better_infer_interface

......@@ -13,9 +13,10 @@
# limitations under the License
import sys
import paddle.v2 as paddle
from api_v2_vgg import vgg_bn_drop
from api_v2_resnet import resnet_cifar10
def main():
......@@ -23,16 +24,16 @@ def main():
classdim = 10
# PaddlePaddle init
paddle.init(use_gpu=True, trainer_count=1)
paddle.init(use_gpu=False, trainer_count=1)
image = paddle.layer.data(
name="image", type=paddle.data_type.dense_vector(datadim))
# Add neural network config
# option 1. resnet
net = resnet_cifar10(image, depth=32)
# net = resnet_cifar10(image, depth=32)
# option 2. vgg
# net = vgg_bn_drop(image)
net = vgg_bn_drop(image)
out = paddle.layer.fc(input=net,
size=classdim,
......@@ -68,8 +69,8 @@ def main():
result = trainer.test(
reader=paddle.batch(
paddle.dataset.cifar.test10(), batch_size=128),
reader_dict={'image': 0,
'label': 1})
feeding={'image': 0,
'label': 1})
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
# Create trainer
......@@ -83,8 +84,8 @@ def main():
batch_size=128),
num_passes=5,
event_handler=event_handler,
reader_dict={'image': 0,
'label': 1})
feeding={'image': 0,
'label': 1})
if __name__ == '__main__':
......
......@@ -30,26 +30,26 @@ def main():
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
print "Pass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics)
print "Pass %d, Batch %d, Cost %f" % (
event.pass_id, event.batch_id, event.cost)
if isinstance(event, paddle.event.EndPass):
result = trainer.test(
reader=paddle.reader.batched(
uci_housing.test(), batch_size=2),
reader_dict={'x': 0,
if (event.pass_id + 1) % 10 == 0:
result = trainer.test(
reader=paddle.batch(
uci_housing.test(), batch_size=2),
feeding={'x': 0,
'y': 1})
if event.pass_id % 10 == 0:
print "Test %d, %s" % (event.pass_id, result.metrics)
print "Test %d, %.2f" % (event.pass_id, result.cost)
# training
trainer.train(
reader=paddle.reader.batched(
reader=paddle.batch(
paddle.reader.shuffle(
uci_housing.train(), buf_size=500),
batch_size=2),
reader_dict={'x': 0,
'y': 1},
feeding={'x': 0,
'y': 1},
event_handler=event_handler,
num_passes=30)
......
......@@ -5,3 +5,6 @@ plot.png
train.log
*pyc
.ipynb_checkpoints
params.pkl
params.tar
params.tar.gz
import paddle.v2 as paddle
import gzip
def softmax_regression(img):
......@@ -71,7 +72,11 @@ def main():
cost = paddle.layer.classification_cost(input=predict, label=label)
parameters = paddle.parameters.create(cost)
try:
with gzip.open('params.tar.gz', 'r') as f:
parameters = paddle.parameters.Parameters.from_tar(f)
except IOError:
parameters = paddle.parameters.create(cost)
optimizer = paddle.optimizer.Momentum(
learning_rate=0.1 / 128.0,
......@@ -86,10 +91,18 @@ def main():
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
print "Pass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics)
if isinstance(event, paddle.event.EndPass):
if event.batch_id % 1000 == 0:
result = trainer.test(reader=paddle.batch(
paddle.dataset.mnist.test(), batch_size=256))
print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics,
result.metrics)
with gzip.open('params.tar.gz', 'w') as f:
parameters.to_tar(f)
elif isinstance(event, paddle.event.EndPass):
result = trainer.test(reader=paddle.batch(
paddle.dataset.mnist.test(), batch_size=128))
print "Test with Pass %d, Cost %f, %s\n" % (
......
......@@ -163,11 +163,11 @@ def main():
update_equation=optimizer)
parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32))
trn_reader = paddle.reader.batched(
trn_reader = paddle.batch(
paddle.reader.shuffle(
conll05.test(), buf_size=8192), batch_size=10)
reader_dict = {
feeding = {
'word_data': 0,
'ctx_n2_data': 1,
'ctx_n1_data': 2,
......@@ -183,7 +183,7 @@ def main():
reader=trn_reader,
event_handler=event_handler,
num_passes=10000,
reader_dict=reader_dict)
feeding=feeding)
if __name__ == '__main__':
......
......@@ -18,11 +18,7 @@ from paddle.trainer_config_helpers.poolings import MaxPooling
import paddle.v2 as paddle
def convolution_net(input_dim,
class_dim=2,
emb_dim=128,
hid_dim=128,
is_predict=False):
def convolution_net(input_dim, class_dim=2, emb_dim=128, hid_dim=128):
data = paddle.layer.data("word",
paddle.data_type.integer_value_sequence(input_dim))
emb = paddle.layer.embedding(input=data, size=emb_dim)
......@@ -42,8 +38,7 @@ def stacked_lstm_net(input_dim,
class_dim=2,
emb_dim=128,
hid_dim=512,
stacked_num=3,
is_predict=False):
stacked_num=3):
"""
A Wrapper for sentiment classification task.
This network uses bi-directional recurrent network,
......@@ -110,7 +105,7 @@ def stacked_lstm_net(input_dim,
if __name__ == '__main__':
# init
paddle.init(use_gpu=True, trainer_count=4)
paddle.init(use_gpu=False, trainer_count=4)
# network config
print 'load dictionary...'
......@@ -143,11 +138,11 @@ if __name__ == '__main__':
sys.stdout.flush()
if isinstance(event, paddle.event.EndPass):
result = trainer.test(
reader=paddle.reader.batched(
reader=paddle.batch(
lambda: paddle.dataset.imdb.test(word_dict),
batch_size=128),
reader_dict={'word': 0,
'label': 1})
feeding={'word': 0,
'label': 1})
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
# create trainer
......@@ -156,11 +151,11 @@ if __name__ == '__main__':
update_equation=adam_optimizer)
trainer.train(
reader=paddle.reader.batched(
reader=paddle.batch(
paddle.reader.shuffle(
lambda: paddle.dataset.imdb.train(word_dict), buf_size=1000),
batch_size=100),
event_handler=event_handler,
reader_dict={'word': 0,
'label': 1},
feeding={'word': 0,
'label': 1},
num_passes=10)
import os
import paddle.v2 as paddle
from seqToseq_net_v2 import seqToseq_net_v2
# Data Definiation.
# TODO:This code should be merged to dataset package.
data_dir = "./data/pre-wmt14"
src_lang_dict = os.path.join(data_dir, 'src.dict')
trg_lang_dict = os.path.join(data_dir, 'trg.dict')
source_dict_dim = len(open(src_lang_dict, "r").readlines())
target_dict_dim = len(open(trg_lang_dict, "r").readlines())
def read_to_dict(dict_path):
with open(dict_path, "r") as fin:
out_dict = {
line.strip(): line_count
for line_count, line in enumerate(fin)
}
return out_dict
src_dict = read_to_dict(src_lang_dict)
trg_dict = read_to_dict(trg_lang_dict)
train_list = os.path.join(data_dir, 'train.list')
test_list = os.path.join(data_dir, 'test.list')
UNK_IDX = 2
START = "<s>"
END = "<e>"
def _get_ids(s, dictionary):
words = s.strip().split()
return [dictionary[START]] + \
[dictionary.get(w, UNK_IDX) for w in words] + \
[dictionary[END]]
def train_reader(file_name):
def reader():
with open(file_name, 'r') as f:
for line_count, line in enumerate(f):
line_split = line.strip().split('\t')
if len(line_split) != 2:
continue
src_seq = line_split[0] # one source sequence
src_ids = _get_ids(src_seq, src_dict)
trg_seq = line_split[1] # one target sequence
trg_words = trg_seq.split()
trg_ids = [trg_dict.get(w, UNK_IDX) for w in trg_words]
# remove sequence whose length > 80 in training mode
if len(src_ids) > 80 or len(trg_ids) > 80:
continue
trg_ids_next = trg_ids + [trg_dict[END]]
trg_ids = [trg_dict[START]] + trg_ids
yield src_ids, trg_ids, trg_ids_next
return reader
def seqToseq_net(source_dict_dim, target_dict_dim):
### Network Architecture
word_vector_dim = 512 # dimension of word vector
decoder_size = 512 # dimension of hidden unit in GRU Decoder network
encoder_size = 512 # dimension of hidden unit in GRU Encoder network
#### Encoder
src_word_id = paddle.layer.data(
name='source_language_word',
type=paddle.data_type.integer_value_sequence(source_dict_dim))
src_embedding = paddle.layer.embedding(
input=src_word_id,
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
src_forward = paddle.networks.simple_gru(
input=src_embedding, size=encoder_size)
src_backward = paddle.networks.simple_gru(
input=src_embedding, size=encoder_size, reverse=True)
encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
#### Decoder
with paddle.layer.mixed(size=decoder_size) as encoded_proj:
encoded_proj += paddle.layer.full_matrix_projection(
input=encoded_vector)
backward_first = paddle.layer.first_seq(input=src_backward)
with paddle.layer.mixed(
size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot:
decoder_boot += paddle.layer.full_matrix_projection(
input=backward_first)
def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
decoder_mem = paddle.layer.memory(
name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
context = paddle.networks.simple_attention(
encoded_sequence=enc_vec,
encoded_proj=enc_proj,
decoder_state=decoder_mem)
with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
decoder_inputs += paddle.layer.full_matrix_projection(input=context)
decoder_inputs += paddle.layer.full_matrix_projection(
input=current_word)
gru_step = paddle.layer.gru_step(
name='gru_decoder',
input=decoder_inputs,
output_mem=decoder_mem,
size=decoder_size)
with paddle.layer.mixed(
size=target_dict_dim,
bias_attr=True,
act=paddle.activation.Softmax()) as out:
out += paddle.layer.full_matrix_projection(input=gru_step)
return out
decoder_group_name = "decoder_group"
group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True)
group_inputs = [group_input1, group_input2]
trg_embedding = paddle.layer.embedding(
input=paddle.layer.data(
name='target_language_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim)),
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
group_inputs.append(trg_embedding)
# For decoder equipped with attention mechanism, in training,
# target embeding (the groudtruth) is the data input,
# while encoded source sequence is accessed to as an unbounded memory.
# Here, the StaticInput defines a read-only memory
# for the recurrent_group.
decoder = paddle.layer.recurrent_group(
name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs)
lbl = paddle.layer.data(
name='target_language_next_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim))
cost = paddle.layer.classification_cost(input=decoder, label=lbl)
return cost
def main():
paddle.init(use_gpu=False, trainer_count=1)
# source and target dict dim.
dict_size = 30000
source_dict_dim = target_dict_dim = dict_size
# define network topology
cost = seqToseq_net_v2(source_dict_dim, target_dict_dim)
cost = seqToseq_net(source_dict_dim, target_dict_dim)
parameters = paddle.parameters.create(cost)
# define optimize method and trainer
......@@ -80,15 +110,15 @@ def main():
update_equation=optimizer)
# define data reader
reader_dict = {
feeding = {
'source_language_word': 0,
'target_language_word': 1,
'target_language_next_word': 2
}
wmt14_reader = paddle.reader.batched(
wmt14_reader = paddle.batch(
paddle.reader.shuffle(
train_reader("data/pre-wmt14/train/train"), buf_size=8192),
paddle.dataset.wmt14.train(dict_size=dict_size), buf_size=8192),
batch_size=5)
# define event_handler callback
......@@ -103,7 +133,7 @@ def main():
reader=wmt14_reader,
event_handler=event_handler,
num_passes=10000,
reader_dict=reader_dict)
feeding=feeding)
if __name__ == '__main__':
......
import paddle.v2 as paddle
def seqToseq_net_v2(source_dict_dim, target_dict_dim):
### Network Architecture
word_vector_dim = 512 # dimension of word vector
decoder_size = 512 # dimension of hidden unit in GRU Decoder network
encoder_size = 512 # dimension of hidden unit in GRU Encoder network
#### Encoder
src_word_id = paddle.layer.data(
name='source_language_word',
type=paddle.data_type.integer_value_sequence(source_dict_dim))
src_embedding = paddle.layer.embedding(
input=src_word_id,
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
src_forward = paddle.networks.simple_gru(
input=src_embedding, size=encoder_size)
src_backward = paddle.networks.simple_gru(
input=src_embedding, size=encoder_size, reverse=True)
encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
#### Decoder
with paddle.layer.mixed(size=decoder_size) as encoded_proj:
encoded_proj += paddle.layer.full_matrix_projection(
input=encoded_vector)
backward_first = paddle.layer.first_seq(input=src_backward)
with paddle.layer.mixed(
size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot:
decoder_boot += paddle.layer.full_matrix_projection(
input=backward_first)
def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
decoder_mem = paddle.layer.memory(
name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
context = paddle.networks.simple_attention(
encoded_sequence=enc_vec,
encoded_proj=enc_proj,
decoder_state=decoder_mem)
with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
decoder_inputs += paddle.layer.full_matrix_projection(input=context)
decoder_inputs += paddle.layer.full_matrix_projection(
input=current_word)
gru_step = paddle.layer.gru_step(
name='gru_decoder',
input=decoder_inputs,
output_mem=decoder_mem,
size=decoder_size)
with paddle.layer.mixed(
size=target_dict_dim,
bias_attr=True,
act=paddle.activation.Softmax()) as out:
out += paddle.layer.full_matrix_projection(input=gru_step)
return out
decoder_group_name = "decoder_group"
group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True)
group_inputs = [group_input1, group_input2]
trg_embedding = paddle.layer.embedding(
input=paddle.layer.data(
name='target_language_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim)),
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
group_inputs.append(trg_embedding)
# For decoder equipped with attention mechanism, in training,
# target embeding (the groudtruth) is the data input,
# while encoded source sequence is accessed to as an unbounded memory.
# Here, the StaticInput defines a read-only memory
# for the recurrent_group.
decoder = paddle.layer.recurrent_group(
name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs)
lbl = paddle.layer.data(
name='target_language_next_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim))
cost = paddle.layer.classification_cost(input=decoder, label=lbl)
return cost
......@@ -39,6 +39,7 @@ register_unary_math_op('abs', act.AbsActivation())
register_unary_math_op('sigmoid', act.SigmoidActivation())
register_unary_math_op('tanh', act.TanhActivation())
register_unary_math_op('square', act.SquareActivation())
register_unary_math_op('relu', act.ReluActivation())
def add(layeroutput, other):
......
......@@ -7,8 +7,9 @@ x = layer_math.exp(x)
x = layer_math.log(x)
x = layer_math.abs(x)
x = layer_math.sigmoid(x)
x = layer_math.tanh(x)
x = layer_math.square(x)
x = layer_math.square(x)
x = layer_math.relu(x)
y = 1 + x
y = y + 1
y = x + y
......
......@@ -65,13 +65,28 @@ layers {
}
}
}
layers {
name: "__tanh_0__"
type: "mixed"
size: 100
active_type: "tanh"
inputs {
input_layer_name: "__sigmoid_0__"
proj_conf {
type: "identity"
name: "___tanh_0__.w0"
input_size: 100
output_size: 100
}
}
}
layers {
name: "__square_0__"
type: "mixed"
size: 100
active_type: "square"
inputs {
input_layer_name: "__sigmoid_0__"
input_layer_name: "__tanh_0__"
proj_conf {
type: "identity"
name: "___square_0__.w0"
......@@ -81,15 +96,15 @@ layers {
}
}
layers {
name: "__square_1__"
name: "__relu_0__"
type: "mixed"
size: 100
active_type: "square"
active_type: "relu"
inputs {
input_layer_name: "__square_0__"
proj_conf {
type: "identity"
name: "___square_1__.w0"
name: "___relu_0__.w0"
input_size: 100
output_size: 100
}
......@@ -101,7 +116,7 @@ layers {
size: 100
active_type: ""
inputs {
input_layer_name: "__square_1__"
input_layer_name: "__relu_0__"
}
slope: 1.0
intercept: 1
......@@ -123,7 +138,7 @@ layers {
size: 100
active_type: ""
inputs {
input_layer_name: "__square_1__"
input_layer_name: "__relu_0__"
proj_conf {
type: "identity"
name: "___mixed_0__.w0"
......@@ -147,7 +162,7 @@ layers {
size: 100
active_type: ""
inputs {
input_layer_name: "__square_1__"
input_layer_name: "__relu_0__"
}
slope: -1.0
intercept: 0.0
......@@ -339,8 +354,9 @@ sub_models {
layer_names: "__log_0__"
layer_names: "__abs_0__"
layer_names: "__sigmoid_0__"
layer_names: "__tanh_0__"
layer_names: "__square_0__"
layer_names: "__square_1__"
layer_names: "__relu_0__"
layer_names: "__slope_intercept_layer_0__"
layer_names: "__slope_intercept_layer_1__"
layer_names: "__mixed_0__"
......
......@@ -14,11 +14,18 @@
from py_paddle import DataProviderConverter
import data_type
import paddle.trainer.PyDataProvider2 as pydp2
__all__ = ['DataFeeder']
def default_feeding_map(data_types):
reader_dict = dict()
for i, tp in enumerate(data_types):
reader_dict[tp[0]] = i
return reader_dict
class DataFeeder(DataProviderConverter):
"""
DataFeeder converts the data returned by paddle.reader into a data structure
......@@ -60,16 +67,21 @@ class DataFeeder(DataProviderConverter):
:type data_types: list
:param reader_dict: A dictionary to specify the position of each data
in the input data.
:type reader_dict: dict
:type feeding: dict
"""
def __init__(self, data_types, reader_dict):
def __init__(self, data_types, feeding=None):
self.input_names = []
input_types = []
self.reader_dict = reader_dict
if feeding is None:
feeding = default_feeding_map(data_types)
self.feeding = feeding
for each in data_types:
self.input_names.append(each[0])
assert isinstance(each[1], data_type.InputType)
if not isinstance(each[1], pydp2.InputType):
raise TypeError("second item in each data_type should be an "
"InputType")
input_types.append(each[1])
DataProviderConverter.__init__(self, input_types)
......@@ -90,7 +102,7 @@ class DataFeeder(DataProviderConverter):
for each in data:
reorder = []
for name in self.input_names:
reorder.append(each[self.reader_dict[name]])
reorder.append(each[self.feeding[name]])
retv.append(reorder)
return retv
......
......@@ -14,129 +14,92 @@
"""
wmt14 dataset
"""
import paddle.v2.dataset.common
import tarfile
import os.path
import itertools
import paddle.v2.dataset.common
__all__ = ['train', 'test', 'build_dict']
URL_DEV_TEST = 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz'
MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5'
URL_TRAIN = 'http://localhost:8000/train.tgz'
MD5_TRAIN = '72de99da2830ea5a3a2c4eb36092bbc7'
def word_count(f, word_freq=None):
add = paddle.v2.dataset.common.dict_add
if word_freq == None:
word_freq = {}
for l in f:
for w in l.strip().split():
add(word_freq, w)
add(word_freq, '<s>')
add(word_freq, '<e>')
return word_freq
def get_word_dix(word_freq):
TYPO_FREQ = 50
word_freq = filter(lambda x: x[1] > TYPO_FREQ, word_freq.items())
word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
words, _ = list(zip(*word_freq_sorted))
word_idx = dict(zip(words, xrange(len(words))))
word_idx['<unk>'] = len(words)
return word_idx
def get_word_freq(train, dev):
word_freq = word_count(train, word_count(dev))
if '<unk>' in word_freq:
# remove <unk> for now, since we will set it as last index
del word_freq['<unk>']
return word_freq
def build_dict():
base_dir = './wmt14-data'
train_en_filename = base_dir + '/train/train.en'
train_fr_filename = base_dir + '/train/train.fr'
dev_en_filename = base_dir + '/dev/ntst1213.en'
dev_fr_filename = base_dir + '/dev/ntst1213.fr'
if not os.path.exists(train_en_filename) or not os.path.exists(
train_fr_filename):
with tarfile.open(
paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14',
MD5_TRAIN)) as tf:
tf.extractall(base_dir)
if not os.path.exists(dev_en_filename) or not os.path.exists(
dev_fr_filename):
with tarfile.open(
paddle.v2.dataset.common.download(URL_DEV_TEST, 'wmt14',
MD5_DEV_TEST)) as tf:
tf.extractall(base_dir)
f_en = open(train_en_filename)
f_fr = open(train_fr_filename)
f_en_dev = open(dev_en_filename)
f_fr_dev = open(dev_fr_filename)
word_freq_en = get_word_freq(f_en, f_en_dev)
word_freq_fr = get_word_freq(f_fr, f_fr_dev)
f_en.close()
f_fr.close()
f_en_dev.close()
f_fr_dev.close()
return get_word_dix(word_freq_en), get_word_dix(word_freq_fr)
def reader_creator(directory, path_en, path_fr, URL, MD5, dict_en, dict_fr):
# this is a small set of data for test. The original data is too large and will be add later.
URL_TRAIN = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz'
MD5_TRAIN = 'a755315dd01c2c35bde29a744ede23a6'
START = "<s>"
END = "<e>"
UNK = "<unk>"
UNK_IDX = 2
def __read_to_dict__(tar_file, dict_size):
def __to_dict__(fd, size):
out_dict = dict()
for line_count, line in enumerate(fd):
if line_count < size:
out_dict[line.strip()] = line_count
else:
break
return out_dict
with tarfile.open(tar_file, mode='r') as f:
names = [
each_item.name for each_item in f
if each_item.name.endswith("src.dict")
]
assert len(names) == 1
src_dict = __to_dict__(f.extractfile(names[0]), dict_size)
names = [
each_item.name for each_item in f
if each_item.name.endswith("trg.dict")
]
assert len(names) == 1
trg_dict = __to_dict__(f.extractfile(names[0]), dict_size)
return src_dict, trg_dict
def reader_creator(tar_file, file_name, dict_size):
def reader():
if not os.path.exists(path_en) or not os.path.exists(path_fr):
with tarfile.open(
paddle.v2.dataset.common.download(URL, 'wmt14', MD5)) as tf:
tf.extractall(directory)
f_en = open(path_en)
f_fr = open(path_fr)
UNK_en = dict_en['<unk>']
UNK_fr = dict_fr['<unk>']
for en, fr in itertools.izip(f_en, f_fr):
src_ids = [dict_en.get(w, UNK_en) for w in en.strip().split()]
tar_ids = [
dict_fr.get(w, UNK_fr)
for w in ['<s>'] + fr.strip().split() + ['<e>']
src_dict, trg_dict = __read_to_dict__(tar_file, dict_size)
with tarfile.open(tar_file, mode='r') as f:
names = [
each_item.name for each_item in f
if each_item.name.endswith(file_name)
]
# remove sequence whose length > 80 in training mode
if len(src_ids) == 0 or len(tar_ids) <= 1 or len(
src_ids) > 80 or len(tar_ids) > 80:
continue
yield src_ids, tar_ids[:-1], tar_ids[1:]
f_en.close()
f_fr.close()
for name in names:
for line in f.extractfile(name):
line_split = line.strip().split('\t')
if len(line_split) != 2:
continue
src_seq = line_split[0] # one source sequence
src_words = src_seq.split()
src_ids = [
src_dict.get(w, UNK_IDX)
for w in [START] + src_words + [END]
]
trg_seq = line_split[1] # one target sequence
trg_words = trg_seq.split()
trg_ids = [trg_dict.get(w, UNK_IDX) for w in trg_words]
# remove sequence whose length > 80 in training mode
if len(src_ids) > 80 or len(trg_ids) > 80:
continue
trg_ids_next = trg_ids + [trg_dict[END]]
trg_ids = [trg_dict[START]] + trg_ids
yield src_ids, trg_ids, trg_ids_next
return reader
def train(dict_en, dict_fr):
directory = './wmt14-data'
return reader_creator(directory, directory + '/train/train.en',
directory + '/train/train.fr', URL_TRAIN, MD5_TRAIN,
dict_en, dict_fr)
def train(dict_size):
return reader_creator(
paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
'train/train', dict_size)
def test(dict_en, dict_fr):
directory = './wmt14-data'
return reader_creator(directory, directory + '/dev/ntst1213.en',
directory + '/dev/ntst1213.fr', URL_DEV_TEST,
MD5_DEV_TEST, dict_en, dict_fr)
def test(dict_size):
return reader_creator(
paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
'test/test', dict_size)
......@@ -21,13 +21,8 @@ class Inference(object):
self.__gradient_machine__ = gm
self.__data_types__ = topo.data_type()
def iter_infer(self,
input=None,
batch_size=None,
reader=None,
reader_dict=None):
if reader_dict is None:
reader_dict = self.default_reader_dict()
def iter_infer(self, input=None, batch_size=None, reader=None,
feeding=None):
if reader is None:
assert input is not None and isinstance(input, collections.Iterable)
......@@ -51,7 +46,7 @@ class Inference(object):
raise ValueError("User should set either input or reader, "
"should not set them both.")
feeder = DataFeeder(self.__data_types__, reader_dict)
feeder = DataFeeder(self.__data_types__, feeding)
self.__gradient_machine__.start()
for data_batch in reader():
yield self.__gradient_machine__.forwardTest(feeder(data_batch))
......@@ -74,19 +69,13 @@ class Inference(object):
else:
return retv
def default_reader_dict(self):
reader_dict = dict()
for i, tp in enumerate(self.__data_types__):
reader_dict[tp[0]] = i
return reader_dict
def infer(output,
parameters,
input=None,
batch_size=None,
reader=None,
reader_dict=None,
feeding=None,
field='value'):
"""
Infer a neural network by given neural network output and parameters. The
......@@ -113,7 +102,7 @@ def infer(output,
:param reader: input data reader creator in batch. If this field is set, the
`input` and `batch_size` will be ignored.
:type reader: callable
:param reader_dict: Reader dictionary. Default could generate from input
:param feeding: Reader dictionary. Default could generate from input
value.
:param field: The prediction field. It should in [`value`, `ids`]. `value`
means return the prediction probabilities, `ids` means return
......@@ -129,4 +118,4 @@ def infer(output,
input=input,
batch_size=batch_size,
reader=reader,
reader_dict=reader_dict)
feeding=feeding)
import numpy as np
import py_paddle.swig_paddle as api
from paddle.proto.ParameterConfig_pb2 import ParameterConfig
import struct
import tarfile
import cStringIO
from topology import Topology
__all__ = ['Parameters', 'create']
......@@ -122,6 +124,12 @@ class Parameters(object):
if len(self.__gradient_machines__) == 0:
# create new parameter in python numpy.
if len(self.__tmp_params__) != 0:
ret_list = [
mat for name, mat in self.__tmp_params__ if name == key
]
if len(ret_list) == 1:
return ret_list[0]
return np.ndarray(shape=shape, dtype=np.float32)
else:
for each_gradient_machine in self.__gradient_machines__:
......@@ -228,6 +236,67 @@ class Parameters(object):
self.__gradient_machines__.append(gradient_machine)
def serialize(self, name, f):
"""
:param name:
:param f:
:type f: file
:return:
"""
param = self.get(name)
size = reduce(lambda a, b: a * b, param.shape)
f.write(struct.pack("IIQ", 0, 4, size))
param = param.astype(np.float32)
f.write(param.tobytes())
def deserialize(self, name, f):
"""
:param name:
:param f:
:type f: file
:return:
"""
f.read(16) # header
arr = np.frombuffer(f.read(), dtype=np.float32)
self.set(name, arr.reshape(self.get_shape(name)))
def to_tar(self, f):
tar = tarfile.TarFile(fileobj=f, mode='w')
for nm in self.names():
buf = cStringIO.StringIO()
self.serialize(nm, buf)
tarinfo = tarfile.TarInfo(name=nm)
buf.seek(0)
tarinfo.size = len(buf.getvalue())
tar.addfile(tarinfo, buf)
conf = self.__param_conf__[nm]
confStr = conf.SerializeToString()
tarinfo = tarfile.TarInfo(name="%s.protobuf" % nm)
tarinfo.size = len(confStr)
buf = cStringIO.StringIO(confStr)
buf.seek(0)
tar.addfile(tarinfo, fileobj=buf)
@staticmethod
def from_tar(f):
params = Parameters()
tar = tarfile.TarFile(fileobj=f, mode='r')
for finfo in tar:
assert isinstance(finfo, tarfile.TarInfo)
if finfo.name.endswith('.protobuf'):
f = tar.extractfile(finfo)
conf = ParameterConfig()
conf.ParseFromString(f.read())
params.__append_config__(conf)
for param_name in params.names():
f = tar.extractfile(param_name)
params.deserialize(param_name, f)
return params
def __get_parameter_in_gradient_machine__(gradient_machine, name):
"""
......
......@@ -22,7 +22,7 @@ cd $SCRIPTPATH
$1 -m pip install ../../../../paddle/dist/*.whl
test_list="test_data_feeder.py"
test_list="test_data_feeder.py test_parameters.py"
export PYTHONPATH=$PWD/../../../../python/
......
import unittest
import sys
try:
import py_paddle
del py_paddle
except ImportError:
print >> sys.stderr, "It seems swig of Paddle is not installed, this " \
"unittest will not be run."
sys.exit(0)
import paddle.v2.parameters as parameters
from paddle.proto.ParameterConfig_pb2 import ParameterConfig
import random
import cStringIO
import numpy
def __rand_param_config__(name):
conf = ParameterConfig()
conf.name = name
size = 1
for i in xrange(2):
dim = random.randint(1, 1000)
conf.dims.append(dim)
size *= dim
conf.size = size
assert conf.IsInitialized()
return conf
class TestParameters(unittest.TestCase):
def test_serialization(self):
params = parameters.Parameters()
params.__append_config__(__rand_param_config__("param_0"))
params.__append_config__(__rand_param_config__("param_1"))
for name in params.names():
param = params.get(name)
param[:] = numpy.random.uniform(
-1.0, 1.0, size=params.get_shape(name))
params.set(name, param)
tmp_file = cStringIO.StringIO()
params.to_tar(tmp_file)
tmp_file.seek(0)
params_dup = parameters.Parameters.from_tar(tmp_file)
self.assertEqual(params_dup.names(), params.names())
for name in params.names():
self.assertEqual(params.get_shape(name), params_dup.get_shape(name))
p0 = params.get(name)
p1 = params_dup.get(name)
self.assertTrue(numpy.isclose(p0, p1).all())
if __name__ == '__main__':
unittest.main()
......@@ -57,11 +57,11 @@ class SGD(object):
self.__topology_in_proto__, api.CREATE_MODE_NORMAL,
self.__optimizer__.enable_types())
assert isinstance(gm, api.GradientMachine)
parameters.append_gradient_machine(gm)
self.__gradient_machine__ = gm
self.__gradient_machine__.randParameters()
parameters.append_gradient_machine(gm)
def train(self, reader, num_passes=1, event_handler=None, reader_dict=None):
def train(self, reader, num_passes=1, event_handler=None, feeding=None):
"""
Training method. Will train num_passes of input data.
......@@ -70,14 +70,13 @@ class SGD(object):
:param event_handler: Event handler. A method will be invoked when event
occurred.
:type event_handler: (BaseEvent) => None
:param feeding: Feeding is a map of neural network input name and array
index that reader returns.
:type feeding: dict
:return:
"""
if event_handler is None:
event_handler = default_event_handler
if reader_dict is None:
reader_dict = self.default_reader_dict()
__check_train_args__(**locals())
updater = self.__optimizer__.create_local_updater()
......@@ -89,9 +88,7 @@ class SGD(object):
pass_evaluator = self.__gradient_machine__.makeEvaluator()
assert isinstance(pass_evaluator, api.Evaluator)
out_args = api.Arguments.createArguments(0)
feeder = DataFeeder(self.__data_types__, reader_dict)
feeder = DataFeeder(self.__data_types__, feeding)
for pass_id in xrange(num_passes):
event_handler(v2_event.BeginPass(pass_id))
pass_evaluator.start()
......@@ -125,17 +122,8 @@ class SGD(object):
event_handler(v2_event.EndPass(pass_id, evaluator=pass_evaluator))
self.__gradient_machine__.finish()
def default_reader_dict(self):
reader_dict = dict()
for i, tp in enumerate(self.__data_types__):
reader_dict[tp[0]] = i
return reader_dict
def test(self, reader, reader_dict=None):
if reader_dict is None:
reader_dict = self.default_reader_dict()
feeder = DataFeeder(self.__data_types__, reader_dict)
def test(self, reader, feeding=None):
feeder = DataFeeder(self.__data_types__, feeding)
evaluator = self.__gradient_machine__.makeEvaluator()
out_args = api.Arguments.createArguments(0)
evaluator.start()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册