diff --git a/demo/image_classification/api_v2_train.py b/demo/image_classification/api_v2_train.py
index e0fc0e04bbd21f691caa1ce3fb95c8a7065d1b3f..53cffa6fb4e8b2e19725f4f44bf7b9ffffb25232 100644
--- a/demo/image_classification/api_v2_train.py
+++ b/demo/image_classification/api_v2_train.py
@@ -13,9 +13,10 @@
 # limitations under the License
 
 import sys
+
 import paddle.v2 as paddle
+
 from api_v2_vgg import vgg_bn_drop
-from api_v2_resnet import resnet_cifar10
 
 
 def main():
@@ -23,16 +24,16 @@ def main():
     classdim = 10
 
     # PaddlePaddle init
-    paddle.init(use_gpu=True, trainer_count=1)
+    paddle.init(use_gpu=False, trainer_count=1)
 
     image = paddle.layer.data(
         name="image", type=paddle.data_type.dense_vector(datadim))
 
     # Add neural network config
     # option 1. resnet
-    net = resnet_cifar10(image, depth=32)
+    # net = resnet_cifar10(image, depth=32)
     # option 2. vgg
-    # net = vgg_bn_drop(image)
+    net = vgg_bn_drop(image)
 
     out = paddle.layer.fc(input=net,
                           size=classdim,
@@ -68,8 +69,8 @@ def main():
         result = trainer.test(
             reader=paddle.batch(
                 paddle.dataset.cifar.test10(), batch_size=128),
-            reader_dict={'image': 0,
-                         'label': 1})
+            feeding={'image': 0,
+                     'label': 1})
         print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
 
     # Create trainer
@@ -83,8 +84,8 @@ def main():
             batch_size=128),
         num_passes=5,
         event_handler=event_handler,
-        reader_dict={'image': 0,
-                     'label': 1})
+        feeding={'image': 0,
+                 'label': 1})
 
 
 if __name__ == '__main__':
diff --git a/demo/introduction/api_train_v2.py b/demo/introduction/api_train_v2.py
index 75dd65f9fc8cd8e7fab5bf30a6337574a645e89f..84125c3b4b621a128fd488ff7fa374a75f620bf1 100644
--- a/demo/introduction/api_train_v2.py
+++ b/demo/introduction/api_train_v2.py
@@ -30,26 +30,26 @@ def main():
     def event_handler(event):
         if isinstance(event, paddle.event.EndIteration):
             if event.batch_id % 100 == 0:
-                print "Pass %d, Batch %d, Cost %f, %s" % (
-                    event.pass_id, event.batch_id, event.cost, event.metrics)
+                print "Pass %d, Batch %d, Cost %f" % (
+                    event.pass_id, event.batch_id, event.cost)
 
         if isinstance(event, paddle.event.EndPass):
-            result = trainer.test(
-                reader=paddle.reader.batched(
-                    uci_housing.test(), batch_size=2),
-                reader_dict={'x': 0,
+            if (event.pass_id + 1) % 10 == 0:
+                result = trainer.test(
+                    reader=paddle.batch(
+                        uci_housing.test(), batch_size=2),
+                    feeding={'x': 0,
                              'y': 1})
-            if event.pass_id % 10 == 0:
-                print "Test %d, %s" % (event.pass_id, result.metrics)
+                print "Test %d, %.2f" % (event.pass_id, result.cost)
 
     # training
     trainer.train(
-        reader=paddle.reader.batched(
+        reader=paddle.batch(
             paddle.reader.shuffle(
                 uci_housing.train(), buf_size=500),
             batch_size=2),
-        reader_dict={'x': 0,
-                     'y': 1},
+        feeding={'x': 0,
+                 'y': 1},
         event_handler=event_handler,
         num_passes=30)
diff --git a/demo/mnist/.gitignore b/demo/mnist/.gitignore
index 8bd9837523ccf98e6e72d5b82934b7b104816217..7e61d5e3a0cabd46d4185454d46610ac2ee2e63f 100644
--- a/demo/mnist/.gitignore
+++ b/demo/mnist/.gitignore
@@ -5,3 +5,6 @@ plot.png
train.log
 *pyc
 .ipynb_checkpoints
+params.pkl
+params.tar
+params.tar.gz
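
The demos above all make the same pair of v2 API renames: `paddle.reader.batched` becomes `paddle.batch`, and the `reader_dict` argument becomes `feeding`. The map itself is unchanged; it names which column of each sample a reader yields feeds which data layer. A minimal sketch of the call shape, assuming a configured `paddle.trainer.SGD` instance named `trainer` (the toy reader is invented here; the layer names match the image-classification demo):

    import paddle.v2 as paddle

    def toy_reader():
        # Each sample is a tuple: column 0 -> 'image' layer, column 1 -> 'label' layer.
        for i in xrange(4):
            yield [0.0] * 3072, i % 10

    trainer.train(
        reader=paddle.batch(toy_reader, batch_size=2),
        feeding={'image': 0, 'label': 1},
        num_passes=1)
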
diff --git a/demo/mnist/api_train_v2.py b/demo/mnist/api_train_v2.py
index d0aca5eb1d94a36a66a2e1c146adb54851baef71..75c2f08132dcae291ea4d5d70edfe804a702dd18 100644
--- a/demo/mnist/api_train_v2.py
+++ b/demo/mnist/api_train_v2.py
@@ -1,4 +1,5 @@
 import paddle.v2 as paddle
+import gzip
 
 
 def softmax_regression(img):
@@ -71,7 +72,11 @@ def main():
 
     cost = paddle.layer.classification_cost(input=predict, label=label)
 
-    parameters = paddle.parameters.create(cost)
+    try:
+        with gzip.open('params.tar.gz', 'r') as f:
+            parameters = paddle.parameters.Parameters.from_tar(f)
+    except IOError:
+        parameters = paddle.parameters.create(cost)
 
     optimizer = paddle.optimizer.Momentum(
         learning_rate=0.1 / 128.0,
@@ -86,10 +91,18 @@ def main():
 
     def event_handler(event):
         if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 100 == 0:
-                print "Pass %d, Batch %d, Cost %f, %s" % (
-                    event.pass_id, event.batch_id, event.cost, event.metrics)
-        if isinstance(event, paddle.event.EndPass):
+            if event.batch_id % 1000 == 0:
+                result = trainer.test(reader=paddle.batch(
+                    paddle.dataset.mnist.test(), batch_size=256))
+
+                print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics,
+                    result.metrics)
+
+                with gzip.open('params.tar.gz', 'w') as f:
+                    parameters.to_tar(f)
+
+        elif isinstance(event, paddle.event.EndPass):
             result = trainer.test(reader=paddle.batch(
                 paddle.dataset.mnist.test(), batch_size=128))
             print "Test with Pass %d, Cost %f, %s\n" % (
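
The mnist demo now checkpoints through the new `Parameters` tar serialization: it restores `params.tar.gz` when one exists, falls back to fresh parameters otherwise, and rewrites the archive every 1000 batches (which is also why the new `.gitignore` entries above ignore the generated archives). The same pattern in isolation, a sketch using the demo's file name:

    import gzip

    import paddle.v2 as paddle

    def load_or_create_parameters(cost, path='params.tar.gz'):
        # Resume from a previous run when a checkpoint archive exists.
        try:
            with gzip.open(path, 'r') as f:
                return paddle.parameters.Parameters.from_tar(f)
        except IOError:
            return paddle.parameters.create(cost)

    def save_parameters(parameters, path='params.tar.gz'):
        # Rewrite the whole archive; to_tar stores one data entry plus one
        # .protobuf config entry per parameter.
        with gzip.open(path, 'w') as f:
            parameters.to_tar(f)
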
diff --git a/demo/semantic_role_labeling/api_train_v2.py b/demo/semantic_role_labeling/api_train_v2.py
index 15db922b97abc5ae79f095edfd632604eec8ab94..036cad4b0a32357bb42580ef577a1eba558be8fe 100644
--- a/demo/semantic_role_labeling/api_train_v2.py
+++ b/demo/semantic_role_labeling/api_train_v2.py
@@ -163,11 +163,11 @@ def main():
         update_equation=optimizer)
 
     parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32))
-    trn_reader = paddle.reader.batched(
+    trn_reader = paddle.batch(
         paddle.reader.shuffle(
             conll05.test(), buf_size=8192), batch_size=10)
 
-    reader_dict = {
+    feeding = {
         'word_data': 0,
         'ctx_n2_data': 1,
         'ctx_n1_data': 2,
@@ -183,7 +183,7 @@ def main():
         reader=trn_reader,
         event_handler=event_handler,
         num_passes=10000,
-        reader_dict=reader_dict)
+        feeding=feeding)
 
 
 if __name__ == '__main__':
diff --git a/demo/sentiment/train_v2.py b/demo/sentiment/train_v2.py
index 3a266e74ea93068cad2757d0076a4ae664ad4cf8..fd7243cbe69977dcabc9ecf1d060e62f313b8cfd 100644
--- a/demo/sentiment/train_v2.py
+++ b/demo/sentiment/train_v2.py
@@ -18,11 +18,7 @@ from paddle.trainer_config_helpers.poolings import MaxPooling
 import paddle.v2 as paddle
 
 
-def convolution_net(input_dim,
-                    class_dim=2,
-                    emb_dim=128,
-                    hid_dim=128,
-                    is_predict=False):
+def convolution_net(input_dim, class_dim=2, emb_dim=128, hid_dim=128):
     data = paddle.layer.data("word",
                              paddle.data_type.integer_value_sequence(input_dim))
     emb = paddle.layer.embedding(input=data, size=emb_dim)
@@ -42,8 +38,7 @@ def stacked_lstm_net(input_dim,
                      class_dim=2,
                      emb_dim=128,
                      hid_dim=512,
-                     stacked_num=3,
-                     is_predict=False):
+                     stacked_num=3):
     """
     A Wrapper for sentiment classification task.
     This network uses bi-directional recurrent network,
@@ -110,7 +105,7 @@ def stacked_lstm_net(input_dim,
 
 if __name__ == '__main__':
     # init
-    paddle.init(use_gpu=True, trainer_count=4)
+    paddle.init(use_gpu=False, trainer_count=4)
 
     # network config
     print 'load dictionary...'
@@ -143,11 +138,11 @@ if __name__ == '__main__':
             sys.stdout.flush()
         if isinstance(event, paddle.event.EndPass):
             result = trainer.test(
-                reader=paddle.reader.batched(
+                reader=paddle.batch(
                     lambda: paddle.dataset.imdb.test(word_dict),
                     batch_size=128),
-                reader_dict={'word': 0,
-                             'label': 1})
+                feeding={'word': 0,
+                         'label': 1})
             print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
 
     # create trainer
@@ -156,11 +151,11 @@ if __name__ == '__main__':
         update_equation=adam_optimizer)
 
     trainer.train(
-        reader=paddle.reader.batched(
+        reader=paddle.batch(
             paddle.reader.shuffle(
                 lambda: paddle.dataset.imdb.train(word_dict), buf_size=1000),
             batch_size=100),
         event_handler=event_handler,
-        reader_dict={'word': 0,
-                     'label': 1},
+        feeding={'word': 0,
+                 'label': 1},
         num_passes=10)
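
Both demos above switch from `paddle.reader.batched` to `paddle.batch`. The decorator's job is simply to turn a sample-level reader into a batch-level one; a rough pure-Python equivalent of what it does (a sketch, not the library's actual implementation):

    def batch(reader, batch_size):
        # Group consecutive samples from `reader` into lists of `batch_size`.
        def batch_reader():
            b = []
            for sample in reader():
                b.append(sample)
                if len(b) == batch_size:
                    yield b
                    b = []
            if b:
                yield b  # trailing, possibly smaller, batch

        return batch_reader
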
diff --git a/demo/seqToseq/api_train_v2.py b/demo/seqToseq/api_train_v2.py
index a5f59ec379738eb5bed3e7559739cae38582ed06..6efd254e7a48703a69c9f09dd35d41ba7ac5689a 100644
--- a/demo/seqToseq/api_train_v2.py
+++ b/demo/seqToseq/api_train_v2.py
@@ -1,76 +1,106 @@
-import os
-
 import paddle.v2 as paddle
 
-from seqToseq_net_v2 import seqToseq_net_v2
-
-# Data Definition.
-# TODO: This code should be merged into the dataset package.
-data_dir = "./data/pre-wmt14"
-src_lang_dict = os.path.join(data_dir, 'src.dict')
-trg_lang_dict = os.path.join(data_dir, 'trg.dict')
-
-source_dict_dim = len(open(src_lang_dict, "r").readlines())
-target_dict_dim = len(open(trg_lang_dict, "r").readlines())
-
-
-def read_to_dict(dict_path):
-    with open(dict_path, "r") as fin:
-        out_dict = {
-            line.strip(): line_count
-            for line_count, line in enumerate(fin)
-        }
-    return out_dict
-
-
-src_dict = read_to_dict(src_lang_dict)
-trg_dict = read_to_dict(trg_lang_dict)
-
-train_list = os.path.join(data_dir, 'train.list')
-test_list = os.path.join(data_dir, 'test.list')
-
-UNK_IDX = 2
-START = "<s>"
-END = "<e>"
-
-
-def _get_ids(s, dictionary):
-    words = s.strip().split()
-    return [dictionary[START]] + \
-           [dictionary.get(w, UNK_IDX) for w in words] + \
-           [dictionary[END]]
-
-
-def train_reader(file_name):
-    def reader():
-        with open(file_name, 'r') as f:
-            for line_count, line in enumerate(f):
-                line_split = line.strip().split('\t')
-                if len(line_split) != 2:
-                    continue
-                src_seq = line_split[0]  # one source sequence
-                src_ids = _get_ids(src_seq, src_dict)
-
-                trg_seq = line_split[1]  # one target sequence
-                trg_words = trg_seq.split()
-                trg_ids = [trg_dict.get(w, UNK_IDX) for w in trg_words]
-
-                # remove sequence whose length > 80 in training mode
-                if len(src_ids) > 80 or len(trg_ids) > 80:
-                    continue
-                trg_ids_next = trg_ids + [trg_dict[END]]
-                trg_ids = [trg_dict[START]] + trg_ids
-
-                yield src_ids, trg_ids, trg_ids_next
-
-    return reader
+
+def seqToseq_net(source_dict_dim, target_dict_dim):
+    ### Network Architecture
+    word_vector_dim = 512  # dimension of word vector
+    decoder_size = 512  # dimension of hidden unit in GRU Decoder network
+    encoder_size = 512  # dimension of hidden unit in GRU Encoder network
+
+    #### Encoder
+    src_word_id = paddle.layer.data(
+        name='source_language_word',
+        type=paddle.data_type.integer_value_sequence(source_dict_dim))
+    src_embedding = paddle.layer.embedding(
+        input=src_word_id,
+        size=word_vector_dim,
+        param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
+    src_forward = paddle.networks.simple_gru(
+        input=src_embedding, size=encoder_size)
+    src_backward = paddle.networks.simple_gru(
+        input=src_embedding, size=encoder_size, reverse=True)
+    encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
+
+    #### Decoder
+    with paddle.layer.mixed(size=decoder_size) as encoded_proj:
+        encoded_proj += paddle.layer.full_matrix_projection(
+            input=encoded_vector)
+
+    backward_first = paddle.layer.first_seq(input=src_backward)
+
+    with paddle.layer.mixed(
+            size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot:
+        decoder_boot += paddle.layer.full_matrix_projection(
+            input=backward_first)
+
+    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
+
+        decoder_mem = paddle.layer.memory(
+            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
+
+        context = paddle.networks.simple_attention(
+            encoded_sequence=enc_vec,
+            encoded_proj=enc_proj,
+            decoder_state=decoder_mem)
+
+        with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
+            decoder_inputs += paddle.layer.full_matrix_projection(input=context)
+            decoder_inputs += paddle.layer.full_matrix_projection(
+                input=current_word)
+
+        gru_step = paddle.layer.gru_step(
+            name='gru_decoder',
+            input=decoder_inputs,
+            output_mem=decoder_mem,
+            size=decoder_size)
+
+        with paddle.layer.mixed(
+                size=target_dict_dim,
+                bias_attr=True,
+                act=paddle.activation.Softmax()) as out:
+            out += paddle.layer.full_matrix_projection(input=gru_step)
+        return out
+
+    decoder_group_name = "decoder_group"
+    group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
+    group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True)
+    group_inputs = [group_input1, group_input2]
+
+    trg_embedding = paddle.layer.embedding(
+        input=paddle.layer.data(
+            name='target_language_word',
+            type=paddle.data_type.integer_value_sequence(target_dict_dim)),
+        size=word_vector_dim,
+        param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
+    group_inputs.append(trg_embedding)
+
+    # For a decoder equipped with an attention mechanism, in training,
+    # the target embedding (the ground truth) is the data input,
+    # while the encoded source sequence is accessed as an unbounded memory.
+    # Here, the StaticInput defines a read-only memory
+    # for the recurrent_group.
+    decoder = paddle.layer.recurrent_group(
+        name=decoder_group_name,
+        step=gru_decoder_with_attention,
+        input=group_inputs)
+
+    lbl = paddle.layer.data(
+        name='target_language_next_word',
+        type=paddle.data_type.integer_value_sequence(target_dict_dim))
+    cost = paddle.layer.classification_cost(input=decoder, label=lbl)
+
+    return cost
 
 
 def main():
     paddle.init(use_gpu=False, trainer_count=1)
 
+    # source and target dictionary dimensions.
+    dict_size = 30000
+    source_dict_dim = target_dict_dim = dict_size
+
     # define network topology
-    cost = seqToseq_net_v2(source_dict_dim, target_dict_dim)
+    cost = seqToseq_net(source_dict_dim, target_dict_dim)
     parameters = paddle.parameters.create(cost)
 
     # define optimize method and trainer
@@ -80,15 +110,15 @@ def main():
         update_equation=optimizer)
 
     # define data reader
-    reader_dict = {
+    feeding = {
         'source_language_word': 0,
         'target_language_word': 1,
         'target_language_next_word': 2
     }
 
-    wmt14_reader = paddle.reader.batched(
+    wmt14_reader = paddle.batch(
         paddle.reader.shuffle(
-            train_reader("data/pre-wmt14/train/train"), buf_size=8192),
+            paddle.dataset.wmt14.train(dict_size=dict_size), buf_size=8192),
         batch_size=5)
 
     # define event_handler callback
@@ -103,7 +133,7 @@ def main():
         reader=wmt14_reader,
         event_handler=event_handler,
         num_passes=10000,
-        reader_dict=reader_dict)
+        feeding=feeding)
 
 
 if __name__ == '__main__':
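
The rewritten demo drops its private WMT-14 preprocessing and reads `paddle.dataset.wmt14` directly. Every sample the packaged reader yields is a triple of integer sequences, which is exactly what the `feeding` map above indexes. An illustrative sample (the token ids are made up; assuming `<s>`, `<e>` and `<unk>` occupy dictionary slots 0, 1 and 2, as the dataset module's hard-coded `UNK_IDX = 2` suggests):

    src_ids = [0, 134, 9, 71, 1]         # <s> + source words + <e>
    trg_ids = [0, 255, 88, 13]           # <s> + target words
    trg_ids_next = [255, 88, 13, 1]      # target words + <e>

    feeding = {
        'source_language_word': 0,       # -> src_ids
        'target_language_word': 1,       # -> trg_ids
        'target_language_next_word': 2,  # -> trg_ids_next
    }
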
diff --git a/demo/seqToseq/seqToseq_net_v2.py b/demo/seqToseq/seqToseq_net_v2.py
deleted file mode 100644
index 058a6789d7094c71492ed9772ed5594c4c0c8f84..0000000000000000000000000000000000000000
--- a/demo/seqToseq/seqToseq_net_v2.py
+++ /dev/null
@@ -1,92 +0,0 @@
-import paddle.v2 as paddle
-
-
-def seqToseq_net_v2(source_dict_dim, target_dict_dim):
-    ### Network Architecture
-    word_vector_dim = 512  # dimension of word vector
-    decoder_size = 512  # dimension of hidden unit in GRU Decoder network
-    encoder_size = 512  # dimension of hidden unit in GRU Encoder network
-
-    #### Encoder
-    src_word_id = paddle.layer.data(
-        name='source_language_word',
-        type=paddle.data_type.integer_value_sequence(source_dict_dim))
-    src_embedding = paddle.layer.embedding(
-        input=src_word_id,
-        size=word_vector_dim,
-        param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
-    src_forward = paddle.networks.simple_gru(
-        input=src_embedding, size=encoder_size)
-    src_backward = paddle.networks.simple_gru(
-        input=src_embedding, size=encoder_size, reverse=True)
-    encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
-
-    #### Decoder
-    with paddle.layer.mixed(size=decoder_size) as encoded_proj:
-        encoded_proj += paddle.layer.full_matrix_projection(
-            input=encoded_vector)
-
-    backward_first = paddle.layer.first_seq(input=src_backward)
-
-    with paddle.layer.mixed(
-            size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot:
-        decoder_boot += paddle.layer.full_matrix_projection(
-            input=backward_first)
-
-    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
-
-        decoder_mem = paddle.layer.memory(
-            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
-
-        context = paddle.networks.simple_attention(
-            encoded_sequence=enc_vec,
-            encoded_proj=enc_proj,
-            decoder_state=decoder_mem)
-
-        with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
-            decoder_inputs += paddle.layer.full_matrix_projection(input=context)
-            decoder_inputs += paddle.layer.full_matrix_projection(
-                input=current_word)
-
-        gru_step = paddle.layer.gru_step(
-            name='gru_decoder',
-            input=decoder_inputs,
-            output_mem=decoder_mem,
-            size=decoder_size)
-
-        with paddle.layer.mixed(
-                size=target_dict_dim,
-                bias_attr=True,
-                act=paddle.activation.Softmax()) as out:
-            out += paddle.layer.full_matrix_projection(input=gru_step)
-        return out
-
-    decoder_group_name = "decoder_group"
-    group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
-    group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True)
-    group_inputs = [group_input1, group_input2]
-
-    trg_embedding = paddle.layer.embedding(
-        input=paddle.layer.data(
-            name='target_language_word',
-            type=paddle.data_type.integer_value_sequence(target_dict_dim)),
-        size=word_vector_dim,
-        param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
-    group_inputs.append(trg_embedding)
-
-    # For decoder equipped with attention mechanism, in training,
-    # target embeding (the groudtruth) is the data input,
-    # while encoded source sequence is accessed to as an unbounded memory.
-    # Here, the StaticInput defines a read-only memory
-    # for the recurrent_group.
-    decoder = paddle.layer.recurrent_group(
-        name=decoder_group_name,
-        step=gru_decoder_with_attention,
-        input=group_inputs)
-
-    lbl = paddle.layer.data(
-        name='target_language_next_word',
-        type=paddle.data_type.integer_value_sequence(target_dict_dim))
-    cost = paddle.layer.classification_cost(input=decoder, label=lbl)
-
-    return cost
diff --git a/python/paddle/trainer_config_helpers/layer_math.py b/python/paddle/trainer_config_helpers/layer_math.py
index 2d9e36f2b0d379d907634208a45c69efa9dbba3d..544b443825393c9a31c0375724d4ca63dac5c5eb 100644
--- a/python/paddle/trainer_config_helpers/layer_math.py
+++ b/python/paddle/trainer_config_helpers/layer_math.py
@@ -39,6 +39,7 @@ register_unary_math_op('abs', act.AbsActivation())
 register_unary_math_op('sigmoid', act.SigmoidActivation())
 register_unary_math_op('tanh', act.TanhActivation())
 register_unary_math_op('square', act.SquareActivation())
+register_unary_math_op('relu', act.ReluActivation())
 
 
 def add(layeroutput, other):
diff --git a/python/paddle/trainer_config_helpers/tests/configs/math_ops.py b/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
index 3331c10d6497f58eb135208bd7abe48aacfb10ae..24c901c8ee3ab1c90fc14fbff761db06345a6313 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
@@ -7,8 +7,9 @@ x = layer_math.exp(x)
 x = layer_math.log(x)
 x = layer_math.abs(x)
 x = layer_math.sigmoid(x)
+x = layer_math.tanh(x)
 x = layer_math.square(x)
-x = layer_math.square(x)
+x = layer_math.relu(x)
 y = 1 + x
 y = y + 1
 y = x + y
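
With `relu` registered, it chains with the other unary helpers inside a trainer config, as the updated test config does. A minimal standalone config sketch (the `settings` values and layer size are illustrative):

    from paddle.trainer_config_helpers import *
    from paddle.trainer_config_helpers import layer_math

    settings(batch_size=1000, learning_rate=1e-5)

    x = data_layer(name='data', size=100)
    x = layer_math.sigmoid(x)
    x = layer_math.tanh(x)
    x = layer_math.relu(x)  # the newly registered op
    y = 1 + x               # becomes a slope_intercept layer (slope=1, intercept=1)
    outputs(y)
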
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr
index da8da1b541f37a09654202f68232b99e4dac9f61..9b8a2ad9687d313e6c5017c2d7331eddf539af92 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr
@@ -65,13 +65,28 @@ layers {
     }
   }
 }
+layers {
+  name: "__tanh_0__"
+  type: "mixed"
+  size: 100
+  active_type: "tanh"
+  inputs {
+    input_layer_name: "__sigmoid_0__"
+    proj_conf {
+      type: "identity"
+      name: "___tanh_0__.w0"
+      input_size: 100
+      output_size: 100
+    }
+  }
+}
 layers {
   name: "__square_0__"
   type: "mixed"
   size: 100
   active_type: "square"
   inputs {
-    input_layer_name: "__sigmoid_0__"
+    input_layer_name: "__tanh_0__"
     proj_conf {
       type: "identity"
       name: "___square_0__.w0"
@@ -81,15 +96,15 @@ layers {
   }
 }
 layers {
-  name: "__square_1__"
+  name: "__relu_0__"
   type: "mixed"
   size: 100
-  active_type: "square"
+  active_type: "relu"
   inputs {
     input_layer_name: "__square_0__"
     proj_conf {
       type: "identity"
-      name: "___square_1__.w0"
+      name: "___relu_0__.w0"
       input_size: 100
       output_size: 100
     }
@@ -101,7 +116,7 @@ layers {
   size: 100
   active_type: ""
   inputs {
-    input_layer_name: "__square_1__"
+    input_layer_name: "__relu_0__"
   }
   slope: 1.0
   intercept: 1
@@ -123,7 +138,7 @@ layers {
   size: 100
   active_type: ""
   inputs {
-    input_layer_name: "__square_1__"
+    input_layer_name: "__relu_0__"
     proj_conf {
       type: "identity"
       name: "___mixed_0__.w0"
@@ -147,7 +162,7 @@ layers {
   size: 100
   active_type: ""
   inputs {
-    input_layer_name: "__square_1__"
+    input_layer_name: "__relu_0__"
   }
   slope: -1.0
   intercept: 0.0
@@ -339,8 +354,9 @@ sub_models {
   layer_names: "__log_0__"
   layer_names: "__abs_0__"
   layer_names: "__sigmoid_0__"
+  layer_names: "__tanh_0__"
   layer_names: "__square_0__"
-  layer_names: "__square_1__"
+  layer_names: "__relu_0__"
   layer_names: "__slope_intercept_layer_0__"
   layer_names: "__slope_intercept_layer_1__"
   layer_names: "__mixed_0__"
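
The regenerated protostr shows what each `layer_math` helper expands to: a `mixed` layer whose single input is an identity projection and whose `active_type` carries the op. The new `__relu_0__` entry above is therefore roughly equivalent to writing this by hand (a sketch, not generated code; `prev` stands for the preceding layer):

    from paddle.trainer_config_helpers import *

    prev = data_layer(name='data', size=100)
    relu_out = mixed_layer(
        size=100,
        input=[identity_projection(input=prev)],
        act=ReluActivation())
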
diff --git a/python/paddle/v2/data_feeder.py b/python/paddle/v2/data_feeder.py
index b7465238be8138e47913b7bd3f1c669ed9653958..ba77fecf21eecf9115cc1b20720383b790294eb0 100644
--- a/python/paddle/v2/data_feeder.py
+++ b/python/paddle/v2/data_feeder.py
@@ -14,11 +14,18 @@
 
 from py_paddle import DataProviderConverter
 
-import data_type
+import paddle.trainer.PyDataProvider2 as pydp2
 
 __all__ = ['DataFeeder']
 
 
+def default_feeding_map(data_types):
+    reader_dict = dict()
+    for i, tp in enumerate(data_types):
+        reader_dict[tp[0]] = i
+    return reader_dict
+
+
 class DataFeeder(DataProviderConverter):
     """
     DataFeeder converts the data returned by paddle.reader into a data structure
@@ -60,16 +67,21 @@ class DataFeeder(DataProviderConverter):
     :type data_types: list
-    :param reader_dict: A dictionary to specify the position of each data
-                        in the input data.
-    :type reader_dict: dict
+    :param feeding: A dictionary to specify the position of each data
+                    in the input data.
+    :type feeding: dict
     """
 
-    def __init__(self, data_types, reader_dict):
+    def __init__(self, data_types, feeding=None):
         self.input_names = []
         input_types = []
-        self.reader_dict = reader_dict
+        if feeding is None:
+            feeding = default_feeding_map(data_types)
+
+        self.feeding = feeding
         for each in data_types:
             self.input_names.append(each[0])
-            assert isinstance(each[1], data_type.InputType)
+            if not isinstance(each[1], pydp2.InputType):
+                raise TypeError("second item in each data_type should be an "
+                                "InputType")
             input_types.append(each[1])
         DataProviderConverter.__init__(self, input_types)
 
@@ -90,7 +102,7 @@ class DataFeeder(DataProviderConverter):
         for each in data:
             reorder = []
             for name in self.input_names:
-                reorder.append(each[self.reader_dict[name]])
+                reorder.append(each[self.feeding[name]])
             retv.append(reorder)
         return retv
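
`DataFeeder` now fills in a default map when `feeding` is omitted: `default_feeding_map` binds each entry of `data_types` to its own position. Readers that already yield columns in declaration order therefore need no explicit map; one is only required when the columns are laid out differently. A sketch:

    import paddle.v2 as paddle
    from paddle.v2.data_feeder import DataFeeder

    data_types = [('word', paddle.data_type.integer_value_sequence(1000)),
                  ('label', paddle.data_type.integer_value(2))]

    # These two are equivalent: the default map is {'word': 0, 'label': 1}.
    feeder = DataFeeder(data_types)
    feeder = DataFeeder(data_types, feeding={'word': 0, 'label': 1})

    # A reader with swapped columns needs the map spelled out.
    feeder = DataFeeder(data_types, feeding={'word': 1, 'label': 0})
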
diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py
index 9904848b5d3ef95dc331fc0ba1a98f29f8b1dfeb..f5a16d51477f9cfbf0cd32af54098406fbbd2b41 100644
--- a/python/paddle/v2/dataset/wmt14.py
+++ b/python/paddle/v2/dataset/wmt14.py
@@ -14,129 +14,92 @@
 """
 wmt14 dataset
 """
-import paddle.v2.dataset.common
 import tarfile
-import os.path
-import itertools
+
+import paddle.v2.dataset.common
 
 __all__ = ['train', 'test', 'build_dict']
 
 URL_DEV_TEST = 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz'
 MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5'
-URL_TRAIN = 'http://localhost:8000/train.tgz'
-MD5_TRAIN = '72de99da2830ea5a3a2c4eb36092bbc7'
-
-
-def word_count(f, word_freq=None):
-    add = paddle.v2.dataset.common.dict_add
-    if word_freq == None:
-        word_freq = {}
-
-    for l in f:
-        for w in l.strip().split():
-            add(word_freq, w)
-        add(word_freq, '<s>')
-        add(word_freq, '<e>')
-
-    return word_freq
-
-
-def get_word_dix(word_freq):
-    TYPO_FREQ = 50
-    word_freq = filter(lambda x: x[1] > TYPO_FREQ, word_freq.items())
-    word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
-    words, _ = list(zip(*word_freq_sorted))
-    word_idx = dict(zip(words, xrange(len(words))))
-    word_idx['<unk>'] = len(words)
-    return word_idx
-
-
-def get_word_freq(train, dev):
-    word_freq = word_count(train, word_count(dev))
-    if '<unk>' in word_freq:
-        # remove <unk> for now, since we will set it as last index
-        del word_freq['<unk>']
-    return word_freq
-
-
-def build_dict():
-    base_dir = './wmt14-data'
-    train_en_filename = base_dir + '/train/train.en'
-    train_fr_filename = base_dir + '/train/train.fr'
-    dev_en_filename = base_dir + '/dev/ntst1213.en'
-    dev_fr_filename = base_dir + '/dev/ntst1213.fr'
-
-    if not os.path.exists(train_en_filename) or not os.path.exists(
-            train_fr_filename):
-        with tarfile.open(
-                paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14',
-                                                  MD5_TRAIN)) as tf:
-            tf.extractall(base_dir)
-
-    if not os.path.exists(dev_en_filename) or not os.path.exists(
-            dev_fr_filename):
-        with tarfile.open(
-                paddle.v2.dataset.common.download(URL_DEV_TEST, 'wmt14',
-                                                  MD5_DEV_TEST)) as tf:
-            tf.extractall(base_dir)
-
-    f_en = open(train_en_filename)
-    f_fr = open(train_fr_filename)
-    f_en_dev = open(dev_en_filename)
-    f_fr_dev = open(dev_fr_filename)
-
-    word_freq_en = get_word_freq(f_en, f_en_dev)
-    word_freq_fr = get_word_freq(f_fr, f_fr_dev)
-
-    f_en.close()
-    f_fr.close()
-    f_en_dev.close()
-    f_fr_dev.close()
-
-    return get_word_dix(word_freq_en), get_word_dix(word_freq_fr)
-
-
-def reader_creator(directory, path_en, path_fr, URL, MD5, dict_en, dict_fr):
+# This is a small dataset for testing; the original data is too large and will be added later.
+URL_TRAIN = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz'
+MD5_TRAIN = 'a755315dd01c2c35bde29a744ede23a6'
+
+START = "<s>"
+END = "<e>"
+UNK = "<unk>"
+UNK_IDX = 2
+
+
+def __read_to_dict__(tar_file, dict_size):
+    def __to_dict__(fd, size):
+        out_dict = dict()
+        for line_count, line in enumerate(fd):
+            if line_count < size:
+                out_dict[line.strip()] = line_count
+            else:
+                break
+        return out_dict
+
+    with tarfile.open(tar_file, mode='r') as f:
+        names = [
+            each_item.name for each_item in f
+            if each_item.name.endswith("src.dict")
+        ]
+        assert len(names) == 1
+        src_dict = __to_dict__(f.extractfile(names[0]), dict_size)
+        names = [
+            each_item.name for each_item in f
+            if each_item.name.endswith("trg.dict")
+        ]
+        assert len(names) == 1
+        trg_dict = __to_dict__(f.extractfile(names[0]), dict_size)
+        return src_dict, trg_dict
+
+
+def reader_creator(tar_file, file_name, dict_size):
     def reader():
-        if not os.path.exists(path_en) or not os.path.exists(path_fr):
-            with tarfile.open(
-                    paddle.v2.dataset.common.download(URL, 'wmt14', MD5)) as tf:
-                tf.extractall(directory)
-
-        f_en = open(path_en)
-        f_fr = open(path_fr)
-        UNK_en = dict_en['<unk>']
-        UNK_fr = dict_fr['<unk>']
-
-        for en, fr in itertools.izip(f_en, f_fr):
-            src_ids = [dict_en.get(w, UNK_en) for w in en.strip().split()]
-            tar_ids = [
-                dict_fr.get(w, UNK_fr)
-                for w in ['<s>'] + fr.strip().split() + ['<e>']
-            ]
-
-            # remove sequence whose length > 80 in training mode
-            if len(src_ids) == 0 or len(tar_ids) <= 1 or len(
-                    src_ids) > 80 or len(tar_ids) > 80:
-                continue
-
-            yield src_ids, tar_ids[:-1], tar_ids[1:]
-
-        f_en.close()
-        f_fr.close()
+        src_dict, trg_dict = __read_to_dict__(tar_file, dict_size)
+        with tarfile.open(tar_file, mode='r') as f:
+            names = [
+                each_item.name for each_item in f
+                if each_item.name.endswith(file_name)
+            ]
+            for name in names:
+                for line in f.extractfile(name):
+                    line_split = line.strip().split('\t')
+                    if len(line_split) != 2:
+                        continue
+                    src_seq = line_split[0]  # one source sequence
+                    src_words = src_seq.split()
+                    src_ids = [
+                        src_dict.get(w, UNK_IDX)
+                        for w in [START] + src_words + [END]
+                    ]
+
+                    trg_seq = line_split[1]  # one target sequence
+                    trg_words = trg_seq.split()
+                    trg_ids = [trg_dict.get(w, UNK_IDX) for w in trg_words]
+
+                    # remove sequences longer than 80 words in training mode
+                    if len(src_ids) > 80 or len(trg_ids) > 80:
+                        continue
+                    trg_ids_next = trg_ids + [trg_dict[END]]
+                    trg_ids = [trg_dict[START]] + trg_ids
+
+                    yield src_ids, trg_ids, trg_ids_next
 
     return reader
 
 
-def train(dict_en, dict_fr):
-    directory = './wmt14-data'
-    return reader_creator(directory, directory + '/train/train.en',
-                          directory + '/train/train.fr', URL_TRAIN, MD5_TRAIN,
-                          dict_en, dict_fr)
+def train(dict_size):
+    return reader_creator(
+        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
+        'train/train', dict_size)
 
 
-def test(dict_en, dict_fr):
-    directory = './wmt14-data'
-    return reader_creator(directory, directory + '/dev/ntst1213.en',
-                          directory + '/dev/ntst1213.fr', URL_DEV_TEST,
-                          MD5_DEV_TEST, dict_en, dict_fr)
+def test(dict_size):
+    return reader_creator(
+        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
+        'test/test', dict_size)
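
Packaged this way, the dataset needs no local files at all: `train(dict_size)` downloads and caches the shrunken archive on first use and streams `train/train` straight out of the tar. A quick smoke test of the reader contract (assumes network access for the first call):

    import paddle.v2.dataset.wmt14 as wmt14

    reader = wmt14.train(dict_size=30000)
    src_ids, trg_ids, trg_ids_next = next(reader())
    # trg_ids is the target prefixed with <s>; trg_ids_next is the same
    # sequence shifted left and terminated with <e>.
    print len(src_ids), len(trg_ids), len(trg_ids_next)
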
diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py
index 7d7dc82de987cb23d12c411c08e0e529afefe58b..2ad4d9d1ab037a210b81b4ee63bce267f296bd83 100644
--- a/python/paddle/v2/inference.py
+++ b/python/paddle/v2/inference.py
@@ -21,13 +21,8 @@ class Inference(object):
         self.__gradient_machine__ = gm
         self.__data_types__ = topo.data_type()
 
-    def iter_infer(self,
-                   input=None,
-                   batch_size=None,
-                   reader=None,
-                   reader_dict=None):
-        if reader_dict is None:
-            reader_dict = self.default_reader_dict()
+    def iter_infer(self, input=None, batch_size=None, reader=None,
+                   feeding=None):
         if reader is None:
             assert input is not None and isinstance(input, collections.Iterable)
@@ -51,7 +46,7 @@ class Inference(object):
             raise ValueError("User should set either input or reader, "
                              "should not set them both.")
 
-        feeder = DataFeeder(self.__data_types__, reader_dict)
+        feeder = DataFeeder(self.__data_types__, feeding)
         self.__gradient_machine__.start()
         for data_batch in reader():
             yield self.__gradient_machine__.forwardTest(feeder(data_batch))
@@ -74,19 +69,13 @@ class Inference(object):
         else:
             return retv
 
-    def default_reader_dict(self):
-        reader_dict = dict()
-        for i, tp in enumerate(self.__data_types__):
-            reader_dict[tp[0]] = i
-        return reader_dict
-
 
 def infer(output,
           parameters,
           input=None,
           batch_size=None,
           reader=None,
-          reader_dict=None,
+          feeding=None,
          field='value'):
     """
     Infer a neural network by given neural network output and parameters. The
@@ -113,7 +102,7 @@ def infer(output,
     :param reader: input data reader creator in batch. If this field is set, the
                    `input` and `batch_size` will be ignored.
     :type reader: callable
-    :param reader_dict: Reader dictionary. Default could generate from input
+    :param feeding: Feeding dictionary. A default one is generated from the input
                         value.
     :param field: The prediction field. It should in [`value`, `ids`].
                   `value` means return
@@ -129,4 +118,4 @@ def infer(output,
         input=input,
         batch_size=batch_size,
         reader=reader,
-        reader_dict=reader_dict)
+        feeding=feeding)
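
`paddle.infer` gets the same rename, and its omitted-`feeding` default now comes from `DataFeeder` instead of the removed `default_reader_dict`. A typical call shape, assuming `predict` is a network output layer, `parameters` holds trained values, and `test_data` is a list of single-column samples (all three names are illustrative):

    import paddle.v2 as paddle

    probs = paddle.infer(
        output=predict,
        parameters=parameters,
        input=test_data,
        feeding={'image': 0},
        field='value')  # 'value' -> probabilities, 'ids' -> predicted labels
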
diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py
index 7ee388f067c1868e4d940770342a0519aebbfa5e..05dc5c68dd97b00fb15b74564a32313430c45345 100644
--- a/python/paddle/v2/parameters.py
+++ b/python/paddle/v2/parameters.py
@@ -1,7 +1,9 @@
 import numpy as np
 import py_paddle.swig_paddle as api
 from paddle.proto.ParameterConfig_pb2 import ParameterConfig
-
+import struct
+import tarfile
+import cStringIO
 from topology import Topology
 
 __all__ = ['Parameters', 'create']
@@ -122,6 +124,12 @@ class Parameters(object):
 
         if len(self.__gradient_machines__) == 0:
             # create new parameter in python numpy.
+            if len(self.__tmp_params__) != 0:
+                ret_list = [
+                    mat for name, mat in self.__tmp_params__ if name == key
+                ]
+                if len(ret_list) == 1:
+                    return ret_list[0]
             return np.ndarray(shape=shape, dtype=np.float32)
         else:
             for each_gradient_machine in self.__gradient_machines__:
@@ -228,6 +236,67 @@ class Parameters(object):
 
         self.__gradient_machines__.append(gradient_machine)
 
+    def serialize(self, name, f):
+        """
+        Write parameter `name` to the file-like object `f`: a 16-byte
+        header (version, value width, element count), then raw float32 data.
+        :param name: parameter name.
+        :param f: a writable file-like object.
+        :type f: file
+        """
+        param = self.get(name)
+        size = reduce(lambda a, b: a * b, param.shape)
+        f.write(struct.pack("IIQ", 0, 4, size))
+        param = param.astype(np.float32)
+        f.write(param.tobytes())
+
+    def deserialize(self, name, f):
+        """
+        Read one parameter back from the file-like object `f`, skipping
+        the 16-byte header, and assign it to parameter `name`.
+        :param name: parameter name.
+        :param f: a readable file-like object.
+        :type f: file
+        """
+        f.read(16)  # skip the header written by serialize
+        arr = np.frombuffer(f.read(), dtype=np.float32)
+        self.set(name, arr.reshape(self.get_shape(name)))
+
+    def to_tar(self, f):
+        tar = tarfile.TarFile(fileobj=f, mode='w')
+        for nm in self.names():
+            buf = cStringIO.StringIO()
+            self.serialize(nm, buf)
+            tarinfo = tarfile.TarInfo(name=nm)
+            buf.seek(0)
+            tarinfo.size = len(buf.getvalue())
+            tar.addfile(tarinfo, buf)
+
+            conf = self.__param_conf__[nm]
+            confStr = conf.SerializeToString()
+            tarinfo = tarfile.TarInfo(name="%s.protobuf" % nm)
+            tarinfo.size = len(confStr)
+            buf = cStringIO.StringIO(confStr)
+            buf.seek(0)
+            tar.addfile(tarinfo, fileobj=buf)
+
+    @staticmethod
+    def from_tar(f):
+        params = Parameters()
+        tar = tarfile.TarFile(fileobj=f, mode='r')
+        for finfo in tar:
+            assert isinstance(finfo, tarfile.TarInfo)
+            if finfo.name.endswith('.protobuf'):
+                f = tar.extractfile(finfo)
+                conf = ParameterConfig()
+                conf.ParseFromString(f.read())
+                params.__append_config__(conf)
+
+        for param_name in params.names():
+            f = tar.extractfile(param_name)
+            params.deserialize(param_name, f)
+        return params
+
 
 def __get_parameter_in_gradient_machine__(gradient_machine, name):
     """
diff --git a/python/paddle/v2/tests/run_tests.sh b/python/paddle/v2/tests/run_tests.sh
index b96f54fe9cc78a436bc67e6c542b6e842aba997b..dda1b1bd222a9f226db1a4bd730e9637ab882196 100755
--- a/python/paddle/v2/tests/run_tests.sh
+++ b/python/paddle/v2/tests/run_tests.sh
@@ -22,7 +22,7 @@ cd $SCRIPTPATH
 
 $1 -m pip install ../../../../paddle/dist/*.whl
 
-test_list="test_data_feeder.py"
+test_list="test_data_feeder.py test_parameters.py"
 
 export PYTHONPATH=$PWD/../../../../python/
 
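
`serialize` prefixes the raw float32 buffer with a 16-byte header: two 32-bit unsigned integers (a field written as 0, apparently a version, and the value width 4) followed by a 64-bit element count. That is why `deserialize` skips exactly 16 bytes before `np.frombuffer`. A worked check of the layout (the field meanings are my reading of the code, not documented in the patch):

    import struct

    header = struct.pack("IIQ", 0, 4, 784 * 10)  # version?, sizeof(float32), element count
    assert struct.calcsize("IIQ") == 16          # matches f.read(16) in deserialize
    print struct.unpack("IIQ", header)           # (0, 4, 7840)
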
diff --git a/python/paddle/v2/tests/test_parameters.py b/python/paddle/v2/tests/test_parameters.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebb182caab6430862a8e4da2ae4ea6b1e72f726c
--- /dev/null
+++ b/python/paddle/v2/tests/test_parameters.py
@@ -0,0 +1,60 @@
+import unittest
+import sys
+
+try:
+    import py_paddle
+
+    del py_paddle
+except ImportError:
+    print >> sys.stderr, "It seems swig of Paddle is not installed, this " \
+                         "unittest will not be run."
+    sys.exit(0)
+
+import paddle.v2.parameters as parameters
+from paddle.proto.ParameterConfig_pb2 import ParameterConfig
+import random
+import cStringIO
+import numpy
+
+
+def __rand_param_config__(name):
+    conf = ParameterConfig()
+    conf.name = name
+    size = 1
+    for i in xrange(2):
+        dim = random.randint(1, 1000)
+        conf.dims.append(dim)
+        size *= dim
+    conf.size = size
+    assert conf.IsInitialized()
+    return conf
+
+
+class TestParameters(unittest.TestCase):
+    def test_serialization(self):
+        params = parameters.Parameters()
+        params.__append_config__(__rand_param_config__("param_0"))
+        params.__append_config__(__rand_param_config__("param_1"))
+
+        for name in params.names():
+            param = params.get(name)
+            param[:] = numpy.random.uniform(
+                -1.0, 1.0, size=params.get_shape(name))
+            params.set(name, param)
+
+        tmp_file = cStringIO.StringIO()
+        params.to_tar(tmp_file)
+        tmp_file.seek(0)
+        params_dup = parameters.Parameters.from_tar(tmp_file)
+
+        self.assertEqual(params_dup.names(), params.names())
+
+        for name in params.names():
+            self.assertEqual(params.get_shape(name), params_dup.get_shape(name))
+            p0 = params.get(name)
+            p1 = params_dup.get(name)
+            self.assertTrue(numpy.isclose(p0, p1).all())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py
index d20b6fd552185bea9d1573bbac90e1256e1dba5f..7bd3e2c565ee00c91402e7dea36c7393fb1a9bdf 100644
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@@ -57,11 +57,11 @@ class SGD(object):
             self.__topology_in_proto__, api.CREATE_MODE_NORMAL,
             self.__optimizer__.enable_types())
         assert isinstance(gm, api.GradientMachine)
-        parameters.append_gradient_machine(gm)
         self.__gradient_machine__ = gm
         self.__gradient_machine__.randParameters()
+        parameters.append_gradient_machine(gm)
 
-    def train(self, reader, num_passes=1, event_handler=None, reader_dict=None):
+    def train(self, reader, num_passes=1, event_handler=None, feeding=None):
         """
         Training method. Will train num_passes of input data.
 
@@ -70,14 +70,13 @@ class SGD(object):
         :param event_handler: Event handler. A method will be invoked when event
                               occurred.
         :type event_handler: (BaseEvent) => None
+        :param feeding: A map from data layer name to the position of that
+                        layer's data in each sample the reader returns.
+        :type feeding: dict
         :return:
         """
         if event_handler is None:
             event_handler = default_event_handler
-
-        if reader_dict is None:
-            reader_dict = self.default_reader_dict()
-
         __check_train_args__(**locals())
 
         updater = self.__optimizer__.create_local_updater()
@@ -89,9 +88,7 @@ class SGD(object):
         pass_evaluator = self.__gradient_machine__.makeEvaluator()
         assert isinstance(pass_evaluator, api.Evaluator)
         out_args = api.Arguments.createArguments(0)
-
-        feeder = DataFeeder(self.__data_types__, reader_dict)
-
+        feeder = DataFeeder(self.__data_types__, feeding)
        for pass_id in xrange(num_passes):
             event_handler(v2_event.BeginPass(pass_id))
             pass_evaluator.start()
@@ -125,17 +122,8 @@ class SGD(object):
             event_handler(v2_event.EndPass(pass_id, evaluator=pass_evaluator))
         self.__gradient_machine__.finish()
 
-    def default_reader_dict(self):
-        reader_dict = dict()
-        for i, tp in enumerate(self.__data_types__):
-            reader_dict[tp[0]] = i
-        return reader_dict
-
-    def test(self, reader, reader_dict=None):
-        if reader_dict is None:
-            reader_dict = self.default_reader_dict()
-
-        feeder = DataFeeder(self.__data_types__, reader_dict)
+    def test(self, reader, feeding=None):
+        feeder = DataFeeder(self.__data_types__, feeding)
         evaluator = self.__gradient_machine__.makeEvaluator()
         out_args = api.Arguments.createArguments(0)
         evaluator.start()
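
Two details of the trainer changes are worth noting. First, `append_gradient_machine` now runs after `randParameters`, so parameter values staged on the Python side (for example ones restored via `Parameters.from_tar`) are applied after random initialization rather than clobbered by it; this is my reading of the reorder, consistent with the new `__tmp_params__` lookup in `Parameters.get`. Second, `test` accepts the same optional `feeding` as `train`; with the `DataFeeder` default it can be called without one whenever the reader's columns follow the data-layer declaration order, as the mnist demo above does:

    # Default feeding: reader columns follow the data-layer declaration order.
    result = trainer.test(
        reader=paddle.batch(paddle.dataset.mnist.test(), batch_size=128))
    print result.cost, result.metrics
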