diff --git a/cmake/coverallsGcovJsons.cmake b/cmake/coverallsGcovJsons.cmake index ae3530c3a0eeb79ddbcbf4f2e99be75aa7968a2f..ad9a10cb8616159b9e3aff445e698cb2edb92820 100644 --- a/cmake/coverallsGcovJsons.cmake +++ b/cmake/coverallsGcovJsons.cmake @@ -110,14 +110,13 @@ endmacro() # Get the coverage data. file(GLOB_RECURSE GCDA_FILES "${COV_PATH}" "*.gcda") -message("GCDA files:") +message("Process GCDA files:") +message("===============================") # Get a list of all the object directories needed by gcov # (The directories the .gcda files and .o files are found in) # and run gcov on those. foreach(GCDA ${GCDA_FILES}) - message("Process: ${GCDA}") - message("------------------------------------------------------------------------------") get_filename_component(GCDA_DIR ${GCDA} PATH) # @@ -135,7 +134,7 @@ foreach(GCDA ${GCDA_FILES}) # If -p is not specified then the file is named only "the_file.c.gcov" # execute_process( - COMMAND ${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA} + COMMAND ${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA} OUTPUT_QUIET WORKING_DIRECTORY ${GCDA_DIR} ) endforeach() @@ -383,7 +382,6 @@ foreach(NOT_COVERED_SRC ${COVERAGE_SRCS_REMAINING}) set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}]") # Generate the final JSON for this file. 
- message("Generate JSON for non-gcov file: ${NOT_COVERED_SRC}...") string(CONFIGURE ${SRC_FILE_TEMPLATE} FILE_JSON) set(JSON_GCOV_FILES "${JSON_GCOV_FILES}${FILE_JSON}, ") endforeach() diff --git a/demo/introduction/api_train_v2.py b/demo/introduction/api_train_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..75dd65f9fc8cd8e7fab5bf30a6337574a645e89f --- /dev/null +++ b/demo/introduction/api_train_v2.py @@ -0,0 +1,58 @@ +import paddle.v2 as paddle +import paddle.v2.dataset.uci_housing as uci_housing + + +def main(): + # init + paddle.init(use_gpu=False, trainer_count=1) + + # network config + x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) + y_predict = paddle.layer.fc(input=x, + param_attr=paddle.attr.Param(name='w'), + size=1, + act=paddle.activation.Linear(), + bias_attr=paddle.attr.Param(name='b')) + y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1)) + cost = paddle.layer.regression_cost(input=y_predict, label=y) + + # create parameters + parameters = paddle.parameters.create(cost) + + # create optimizer + optimizer = paddle.optimizer.Momentum(momentum=0) + + trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=optimizer) + + # event_handler to print training and testing info + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "Pass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + + if isinstance(event, paddle.event.EndPass): + result = trainer.test( + reader=paddle.reader.batched( + uci_housing.test(), batch_size=2), + reader_dict={'x': 0, + 'y': 1}) + if event.pass_id % 10 == 0: + print "Test %d, %s" % (event.pass_id, result.metrics) + + # training + trainer.train( + reader=paddle.reader.batched( + paddle.reader.shuffle( + uci_housing.train(), buf_size=500), + batch_size=2), + reader_dict={'x': 0, + 'y': 1}, + event_handler=event_handler, + 
num_passes=30) + + +if __name__ == '__main__': + main() diff --git a/demo/mnist/api_train_v2.py b/demo/mnist/api_train_v2.py index 06beb7024d1fd07dc327cb4c09d74e1b89a7b8ff..9b7ebde5007047e34da9274bf8165cfa527e2cf1 100644 --- a/demo/mnist/api_train_v2.py +++ b/demo/mnist/api_train_v2.py @@ -1,6 +1,59 @@ import paddle.v2 as paddle +def softmax_regression(img): + predict = paddle.layer.fc(input=img, + size=10, + act=paddle.activation.Softmax()) + return predict + + +def multilayer_perceptron(img): + # The first fully-connected layer + hidden1 = paddle.layer.fc(input=img, size=128, act=paddle.activation.Relu()) + # The second fully-connected layer and the according activation function + hidden2 = paddle.layer.fc(input=hidden1, + size=64, + act=paddle.activation.Relu()) + # The thrid fully-connected layer, note that the hidden size should be 10, + # which is the number of unique digits + predict = paddle.layer.fc(input=hidden2, + size=10, + act=paddle.activation.Softmax()) + return predict + + +def convolutional_neural_network(img): + # first conv layer + conv_pool_1 = paddle.networks.simple_img_conv_pool( + input=img, + filter_size=5, + num_filters=20, + num_channel=1, + pool_size=2, + pool_stride=2, + act=paddle.activation.Tanh()) + # second conv layer + conv_pool_2 = paddle.networks.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + num_channel=20, + pool_size=2, + pool_stride=2, + act=paddle.activation.Tanh()) + # The first fully-connected layer + fc1 = paddle.layer.fc(input=conv_pool_2, + size=128, + act=paddle.activation.Tanh()) + # The softmax layer, note that the hidden size should be 10, + # which is the number of unique digits + predict = paddle.layer.fc(input=fc1, + size=10, + act=paddle.activation.Softmax()) + return predict + + def main(): paddle.init(use_gpu=False, trainer_count=1) @@ -9,45 +62,58 @@ def main(): name='pixel', type=paddle.data_type.dense_vector(784)) label = paddle.layer.data( name='label', 
type=paddle.data_type.integer_value(10)) - hidden1 = paddle.layer.fc(input=images, size=200) - hidden2 = paddle.layer.fc(input=hidden1, size=200) - inference = paddle.layer.fc(input=hidden2, - size=10, - act=paddle.activation.Softmax()) - cost = paddle.layer.classification_cost(input=inference, label=label) + + # Here we can build the prediction network in different ways. Please + # choose one by uncomment corresponding line. + predict = softmax_regression(images) + #predict = multilayer_perceptron(images) + #predict = convolutional_neural_network(images) + + cost = paddle.layer.classification_cost(input=predict, label=label) parameters = paddle.parameters.create(cost) - adam_optimizer = paddle.optimizer.Adam(learning_rate=0.01) + optimizer = paddle.optimizer.Momentum( + learning_rate=0.1 / 128.0, + momentum=0.9, + regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128)) trainer = paddle.trainer.SGD(cost=cost, parameters=parameters, - update_equation=adam_optimizer) + update_equation=optimizer) + + lists = [] def event_handler(event): if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 1000 == 0: - result = trainer.test(reader=paddle.reader.batched( - paddle.dataset.mnist.test(), batch_size=256)) - - print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics, - result.metrics) - - else: - pass + if event.batch_id % 100 == 0: + print "Pass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + if isinstance(event, paddle.event.EndPass): + result = trainer.test(reader=paddle.reader.batched( + paddle.dataset.mnist.test(), batch_size=128)) + print "Test with Pass %d, Cost %f, %s\n" % ( + event.pass_id, result.cost, result.metrics) + lists.append((event.pass_id, result.cost, + result.metrics['classification_error_evaluator'])) trainer.train( reader=paddle.reader.batched( paddle.reader.shuffle( paddle.dataset.mnist.train(), buf_size=8192), - 
batch_size=32), - event_handler=event_handler) + batch_size=128), + event_handler=event_handler, + num_passes=100) + + # find the best pass + best = sorted(lists, key=lambda list: float(list[1]))[0] + print 'Best pass is %s, testing Avgcost is %s' % (best[0], best[1]) + print 'The classification accuracy is %.2f%%' % (100 - float(best[2]) * 100) # output is a softmax layer. It returns probabilities. # Shape should be (100, 10) probs = paddle.infer( - output=inference, + output=predict, parameters=parameters, reader=paddle.reader.batched( paddle.reader.firstn( diff --git a/demo/semantic_role_labeling/api_train_v2.py b/demo/semantic_role_labeling/api_train_v2.py index e946a792b5f51ab54355aac0b6e9aef51ae815fb..15db922b97abc5ae79f095edfd632604eec8ab94 100644 --- a/demo/semantic_role_labeling/api_train_v2.py +++ b/demo/semantic_role_labeling/api_train_v2.py @@ -167,8 +167,23 @@ def main(): paddle.reader.shuffle( conll05.test(), buf_size=8192), batch_size=10) + reader_dict = { + 'word_data': 0, + 'ctx_n2_data': 1, + 'ctx_n1_data': 2, + 'ctx_0_data': 3, + 'ctx_p1_data': 4, + 'ctx_p2_data': 5, + 'verb_data': 6, + 'mark_data': 7, + 'target': 8 + } + trainer.train( - reader=trn_reader, event_handler=event_handler, num_passes=10000) + reader=trn_reader, + event_handler=event_handler, + num_passes=10000, + reader_dict=reader_dict) if __name__ == '__main__': diff --git a/demo/sentiment/train_v2.py b/demo/sentiment/train_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..3a266e74ea93068cad2757d0076a4ae664ad4cf8 --- /dev/null +++ b/demo/sentiment/train_v2.py @@ -0,0 +1,166 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import paddle.trainer_config_helpers.attrs as attrs +from paddle.trainer_config_helpers.poolings import MaxPooling +import paddle.v2 as paddle + + +def convolution_net(input_dim, + class_dim=2, + emb_dim=128, + hid_dim=128, + is_predict=False): + data = paddle.layer.data("word", + paddle.data_type.integer_value_sequence(input_dim)) + emb = paddle.layer.embedding(input=data, size=emb_dim) + conv_3 = paddle.networks.sequence_conv_pool( + input=emb, context_len=3, hidden_size=hid_dim) + conv_4 = paddle.networks.sequence_conv_pool( + input=emb, context_len=4, hidden_size=hid_dim) + output = paddle.layer.fc(input=[conv_3, conv_4], + size=class_dim, + act=paddle.activation.Softmax()) + lbl = paddle.layer.data("label", paddle.data_type.integer_value(2)) + cost = paddle.layer.classification_cost(input=output, label=lbl) + return cost + + +def stacked_lstm_net(input_dim, + class_dim=2, + emb_dim=128, + hid_dim=512, + stacked_num=3, + is_predict=False): + """ + A Wrapper for sentiment classification task. + This network uses bi-directional recurrent network, + consisting three LSTM layers. This configure is referred to + the paper as following url, but use fewer layrs. + http://www.aclweb.org/anthology/P15-1109 + + input_dim: here is word dictionary dimension. + class_dim: number of categories. + emb_dim: dimension of word embedding. + hid_dim: dimension of hidden layer. + stacked_num: number of stacked lstm-hidden layer. + is_predict: is predicting or not. + Some layers is not needed in network when predicting. 
+ """ + assert stacked_num % 2 == 1 + + layer_attr = attrs.ExtraLayerAttribute(drop_rate=0.5) + fc_para_attr = attrs.ParameterAttribute(learning_rate=1e-3) + lstm_para_attr = attrs.ParameterAttribute(initial_std=0., learning_rate=1.) + para_attr = [fc_para_attr, lstm_para_attr] + bias_attr = attrs.ParameterAttribute(initial_std=0., l2_rate=0.) + relu = paddle.activation.Relu() + linear = paddle.activation.Linear() + + data = paddle.layer.data("word", + paddle.data_type.integer_value_sequence(input_dim)) + emb = paddle.layer.embedding(input=data, size=emb_dim) + + fc1 = paddle.layer.fc(input=emb, + size=hid_dim, + act=linear, + bias_attr=bias_attr) + lstm1 = paddle.layer.lstmemory( + input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr) + + inputs = [fc1, lstm1] + for i in range(2, stacked_num + 1): + fc = paddle.layer.fc(input=inputs, + size=hid_dim, + act=linear, + param_attr=para_attr, + bias_attr=bias_attr) + lstm = paddle.layer.lstmemory( + input=fc, + reverse=(i % 2) == 0, + act=relu, + bias_attr=bias_attr, + layer_attr=layer_attr) + inputs = [fc, lstm] + + fc_last = paddle.layer.pooling(input=inputs[0], pooling_type=MaxPooling()) + lstm_last = paddle.layer.pooling(input=inputs[1], pooling_type=MaxPooling()) + output = paddle.layer.fc(input=[fc_last, lstm_last], + size=class_dim, + act=paddle.activation.Softmax(), + bias_attr=bias_attr, + param_attr=para_attr) + + lbl = paddle.layer.data("label", paddle.data_type.integer_value(2)) + cost = paddle.layer.classification_cost(input=output, label=lbl) + return cost + + +if __name__ == '__main__': + # init + paddle.init(use_gpu=True, trainer_count=4) + + # network config + print 'load dictionary...' + word_dict = paddle.dataset.imdb.word_dict() + dict_dim = len(word_dict) + class_dim = 2 + + # Please choose the way to build the network + # by uncommenting the corresponding line. 
+ cost = convolution_net(dict_dim, class_dim=class_dim) + # cost = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3) + + # create parameters + parameters = paddle.parameters.create(cost) + + # create optimizer + adam_optimizer = paddle.optimizer.Adam( + learning_rate=2e-3, + regularization=paddle.optimizer.L2Regularization(rate=8e-4), + model_average=paddle.optimizer.ModelAverage(average_window=0.5)) + + # End batch and end pass event handler + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "\nPass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + else: + sys.stdout.write('.') + sys.stdout.flush() + if isinstance(event, paddle.event.EndPass): + result = trainer.test( + reader=paddle.reader.batched( + lambda: paddle.dataset.imdb.test(word_dict), + batch_size=128), + reader_dict={'word': 0, + 'label': 1}) + print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) + + # create trainer + trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=adam_optimizer) + + trainer.train( + reader=paddle.reader.batched( + paddle.reader.shuffle( + lambda: paddle.dataset.imdb.train(word_dict), buf_size=1000), + batch_size=100), + event_handler=event_handler, + reader_dict={'word': 0, + 'label': 1}, + num_passes=10) diff --git a/demo/seqToseq/api_train_v2.py b/demo/seqToseq/api_train_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..74ae1cf9ec81b00b55202d7c1316c7cb074cfd43 --- /dev/null +++ b/demo/seqToseq/api_train_v2.py @@ -0,0 +1,106 @@ +import os + +import paddle.v2 as paddle + +from seqToseq_net_v2 import seqToseq_net_v2 + +# Data Definiation. +# TODO:This code should be merged to dataset package. 
+data_dir = "./data/pre-wmt14" +src_lang_dict = os.path.join(data_dir, 'src.dict') +trg_lang_dict = os.path.join(data_dir, 'trg.dict') + +source_dict_dim = len(open(src_lang_dict, "r").readlines()) +target_dict_dim = len(open(trg_lang_dict, "r").readlines()) + + +def read_to_dict(dict_path): + with open(dict_path, "r") as fin: + out_dict = { + line.strip(): line_count + for line_count, line in enumerate(fin) + } + return out_dict + + +src_dict = read_to_dict(src_lang_dict) +trg_dict = read_to_dict(trg_lang_dict) + +train_list = os.path.join(data_dir, 'train.list') +test_list = os.path.join(data_dir, 'test.list') + +UNK_IDX = 2 +START = "<s>" +END = "<e>" + + +def _get_ids(s, dictionary): + words = s.strip().split() + return [dictionary[START]] + \ + [dictionary.get(w, UNK_IDX) for w in words] + \ + [dictionary[END]] + + +def train_reader(file_name): + def reader(): + with open(file_name, 'r') as f: + for line_count, line in enumerate(f): + line_split = line.strip().split('\t') + if len(line_split) != 2: + continue + src_seq = line_split[0] # one source sequence + src_ids = _get_ids(src_seq, src_dict) + + trg_seq = line_split[1] # one target sequence + trg_words = trg_seq.split() + trg_ids = [trg_dict.get(w, UNK_IDX) for w in trg_words] + + # remove sequence whose length > 80 in training mode + if len(src_ids) > 80 or len(trg_ids) > 80: + continue + trg_ids_next = trg_ids + [trg_dict[END]] + trg_ids = [trg_dict[START]] + trg_ids + + yield src_ids, trg_ids, trg_ids_next + + return reader + + +def main(): + paddle.init(use_gpu=False, trainer_count=1) + + # define network topology + cost = seqToseq_net_v2(source_dict_dim, target_dict_dim) + parameters = paddle.parameters.create(cost) + optimizer = paddle.optimizer.Adam(learning_rate=1e-4) + + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 10 == 0: + print "Pass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + + trainer = 
paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=optimizer) + + reader_dict = { + 'source_language_word': 0, + 'target_language_word': 1, + 'target_language_next_word': 2 + } + + trn_reader = paddle.reader.batched( + paddle.reader.shuffle( + train_reader("data/pre-wmt14/train/train"), buf_size=8192), + batch_size=5) + + trainer.train( + reader=trn_reader, + event_handler=event_handler, + num_passes=10000, + reader_dict=reader_dict) + + +if __name__ == '__main__': + main() diff --git a/demo/seqToseq/seqToseq_net_v2.py b/demo/seqToseq/seqToseq_net_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..1ac95686b43526732ca0bc9bf8f4e07589e24807 --- /dev/null +++ b/demo/seqToseq/seqToseq_net_v2.py @@ -0,0 +1,90 @@ +import paddle.v2.activation as activation +import paddle.v2.attr as attr +import paddle.v2.data_type as data_type +import paddle.v2.layer as layer +import paddle.v2.networks as networks + + +def seqToseq_net_v2(source_dict_dim, target_dict_dim): + ### Network Architecture + word_vector_dim = 512 # dimension of word vector + decoder_size = 512 # dimension of hidden unit in GRU Decoder network + encoder_size = 512 # dimension of hidden unit in GRU Encoder network + + #### Encoder + src_word_id = layer.data( + name='source_language_word', + type=data_type.integer_value_sequence(source_dict_dim)) + src_embedding = layer.embedding( + input=src_word_id, + size=word_vector_dim, + param_attr=attr.ParamAttr(name='_source_language_embedding')) + src_forward = networks.simple_gru(input=src_embedding, size=encoder_size) + src_backward = networks.simple_gru( + input=src_embedding, size=encoder_size, reverse=True) + encoded_vector = layer.concat(input=[src_forward, src_backward]) + + #### Decoder + with layer.mixed(size=decoder_size) as encoded_proj: + encoded_proj += layer.full_matrix_projection(input=encoded_vector) + + backward_first = layer.first_seq(input=src_backward) + + with layer.mixed(size=decoder_size, 
act=activation.Tanh()) as decoder_boot: + decoder_boot += layer.full_matrix_projection(input=backward_first) + + def gru_decoder_with_attention(enc_vec, enc_proj, current_word): + + decoder_mem = layer.memory( + name='gru_decoder', size=decoder_size, boot_layer=decoder_boot) + + context = networks.simple_attention( + encoded_sequence=enc_vec, + encoded_proj=enc_proj, + decoder_state=decoder_mem) + + with layer.mixed(size=decoder_size * 3) as decoder_inputs: + decoder_inputs += layer.full_matrix_projection(input=context) + decoder_inputs += layer.full_matrix_projection(input=current_word) + + gru_step = layer.gru_step( + name='gru_decoder', + input=decoder_inputs, + output_mem=decoder_mem, + size=decoder_size) + + with layer.mixed( + size=target_dict_dim, bias_attr=True, + act=activation.Softmax()) as out: + out += layer.full_matrix_projection(input=gru_step) + return out + + decoder_group_name = "decoder_group" + group_input1 = layer.StaticInputV2(input=encoded_vector, is_seq=True) + group_input2 = layer.StaticInputV2(input=encoded_proj, is_seq=True) + group_inputs = [group_input1, group_input2] + + trg_embedding = layer.embedding( + input=layer.data( + name='target_language_word', + type=data_type.integer_value_sequence(target_dict_dim)), + size=word_vector_dim, + param_attr=attr.ParamAttr(name='_target_language_embedding')) + group_inputs.append(trg_embedding) + + # For decoder equipped with attention mechanism, in training, + # target embeding (the groudtruth) is the data input, + # while encoded source sequence is accessed to as an unbounded memory. + # Here, the StaticInput defines a read-only memory + # for the recurrent_group. 
+ decoder = layer.recurrent_group( + name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs) + + lbl = layer.data( + name='target_language_next_word', + type=data_type.integer_value_sequence(target_dict_dim)) + cost = layer.classification_cost(input=decoder, label=lbl) + + return cost diff --git a/doc/design/reader/README.md b/doc/design/reader/README.md index 17d52b9e20b8130688028092421f4b33f44763ac..03119fdd74502a4534c2e6a576580ce96a721c7e 100644 --- a/doc/design/reader/README.md +++ b/doc/design/reader/README.md @@ -4,9 +4,10 @@ At training and testing time, PaddlePaddle programs need to read data. To ease t - A *reader* is a function that reads data (from file, network, random number generator, etc) and yields data items. - A *reader creator* is a function that returns a reader function. -- A *reader* decorator is a function, which accepts one or more readers, and returns a reader. +- A *reader decorator* is a function, which accepts one or more readers, and returns a reader. +- A *batch reader* is a function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items. -and provide frequently used reader creators and reader decorators. +and provide a function which converts a reader to a batch reader, as well as frequently used reader creators and reader decorators. ## Data Reader Interface @@ -37,9 +38,54 @@ def reader_creator_random_imageand_label(widht, height, label): return reader ``` +## Batch Reader Interface + +A *batch reader* can be any function with no parameters that creates an iterable (anything that can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list must be a tuple. + +Here are valid outputs: +```python +# a mini batch of three data items. Each data item consists of three columns of data, each of which is a number. +[(1, 1, 1), +(2, 2, 2), +(3, 3, 3)] + +# a mini batch of three data items, each data item is a list (single column). 
+[([1,1,1],), +([2,2,2],), +([3,3,3],)] +``` + +Please note that each item inside the list must be a tuple, below is an invalid output: +```python + # wrong, [1,1,1] needs to be inside a tuple: ([1,1,1],). + # Otherwise it's ambiguous whether [1,1,1] means a single column of data [1, 1, 1], + # or three columns of data, each of which is 1. +[[1,1,1], +[2,2,2], +[3,3,3]] +``` + +It's easy to convert a reader into a batch reader: +```python +mnist_train = paddle.dataset.mnist.train() +mnist_train_batch_reader = paddle.batch(mnist_train, 128) +``` + +It's also easy to create a custom batch reader: +```python +def custom_batch_reader(): + while True: + batch = [] + for i in xrange(128): + batch.append((numpy.random.uniform(-1, 1, 28*28),)) # note that it's a tuple being appended. + yield batch + +mnist_random_image_batch_reader = custom_batch_reader +``` + ## Usage -data reader, mapping from item(s) read to data layer, batch size and number of total pass will be passed into `paddle.train`: +The batch reader, the mapping from item(s) read to data layers, the batch size and the total number of passes are passed into `paddle.train`: ```python # two data layer is created: @@ -47,8 +93,8 @@ image_layer = paddle.layer.data("image", ...) label_layer = paddle.layer.data("label", ...) # ... - -paddle.train(paddle.dataset.mnist, {"image":0, "label":1}, 128, 10, ...) +batch_reader = paddle.batch(paddle.dataset.mnist.train(), 128) +paddle.train(batch_reader, {"image":0, "label":1}, 128, 10, ...) ``` ## Data Reader Decorator @@ -64,7 +110,7 @@ Since reading data may take time and training can not proceed without data. It i Use `paddle.reader.buffered` to prefetch data: ```python -buffered_reader = paddle.reader.buffered(paddle.dataset.mnist, 100) +buffered_reader = paddle.reader.buffered(paddle.dataset.mnist.train(), 100) ``` `buffered_reader` will try to buffer (prefetch) `100` data entries. 
@@ -91,10 +137,10 @@ def reader_creator_bool(t): true_reader = reader_creator_bool(True) false_reader = reader_creator_bool(False) -reader = paddle.reader.compose(paddle.dataset.mnist, data_reader_creator_random_image(20, 20), true_reader, false_reader) -# Skipped 1 because paddle.dataset.mnist produces two items per data entry. +reader = paddle.reader.compose(paddle.dataset.mnist.train(), data_reader_creator_random_image(20, 20), true_reader, false_reader) +# Skipped 1 because paddle.dataset.mnist.train() produces two items per data entry. # And we don't care second item at this time. -paddle.train(reader, {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...) +paddle.train(paddle.batch(reader, 128), {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...) ``` ### Shuffle @@ -103,16 +149,20 @@ Given shuffle buffer size `n`, `paddle.reader.shuffle` will return a data reader Example: ```python -reader = paddle.reader.shuffle(paddle.dataset.mnist, 512) +reader = paddle.reader.shuffle(paddle.dataset.mnist.train(), 512) ``` ## Q & A -### Why return only a single entry, but not a mini batch? +### Why does a reader return only a single entry, but not a mini batch? + +Always returning a single entry makes reusing existing data readers much easier (e.g., if an existing reader returns not a single entry but 3 entries, the training code will be more complex because it needs to handle cases like batch size 2). + +We provide the function `paddle.batch` to turn a (single entry) reader into a batch reader. -If a mini batch is returned, data reader need to take care of batch size. But batch size is a concept for training, it makes more sense for user to specify batch size as a parameter for `train`. +### Why do we need a batch reader? Isn't it sufficient for train to take a reader and batch_size as arguments? 
-Practically, always return a single entry make reusing existing data readers much easier (e.g., if existing reader return not a single entry but 3 entries, training code will be more complex because it need to handle cases like batch size 2). +In most cases, having train take a reader and batch_size as arguments would be sufficient. However, sometimes the user wants to customize the order of data entries inside a mini batch, or even change the batch size dynamically. ### Why use a dictionary but not a list to provide mapping? @@ -137,7 +187,7 @@ def image_reader_creator(image_path, label_path, n): # images_reader_creator creates a reader reader = image_reader_creator("/path/to/image_file", "/path/to/label_file", 1024) -paddle.train(reader, {"image":0, "label":1}, ...) +paddle.train(paddle.batch(reader, 128), {"image":0, "label":1}, ...) ``` ### How is `paddle.train` implemented @@ -145,17 +195,8 @@ paddle.train(reader, {"image":0, "label":1}, ...) An example implementation of paddle.train could be: ```python -def make_minibatch(reader, minibatch_size): - def ret(): - r = reader() - buf = [r.next() for x in xrange(minibatch_size)] - while len(buf) > 0: - yield buf - buf = [r.next() for x in xrange(minibatch_size)] - return ret - -def train(reader, mapping, batch_size, total_pass): +def train(batch_reader, mapping, batch_size, total_pass): for pass_idx in range(total_pass): - for mini_batch in make_minibatch(reader): # this loop will never end in online learning. + for mini_batch in batch_reader(): # this loop will never end in online learning. 
do_forward_backward(mini_batch, mapping) ``` diff --git a/doc/howto/usage/k8s/src/k8s_train/start_paddle.py b/doc/howto/usage/k8s/src/k8s_train/start_paddle.py index f1a770ccb54fbd7d4c3cf6bf134d00d7bf5961ca..935c12bb67e1fe08bc135a7a2220fcd43c548482 100755 --- a/doc/howto/usage/k8s/src/k8s_train/start_paddle.py +++ b/doc/howto/usage/k8s/src/k8s_train/start_paddle.py @@ -132,7 +132,8 @@ def startPaddle(idMap={}, train_args_dict=None): logDir = JOB_PATH_OUTPUT + "/node_" + str(trainerId) if not os.path.exists(JOB_PATH_OUTPUT): os.makedirs(JOB_PATH_OUTPUT) - os.mkdir(logDir) + if not os.path.exists(logDir): + os.mkdir(logDir) copyCommand = 'cp -rf ' + JOB_PATH + \ "/" + str(trainerId) + "/data/*" + " ./data/" os.system(copyCommand) diff --git a/paddle/api/Arguments.cpp b/paddle/api/Arguments.cpp index a3f4bfffc9f074900ebcc52876c04bbfc0e570b2..d49b189e253f7a0792fe3f1fe7c8fdbb7071acd4 100644 --- a/paddle/api/Arguments.cpp +++ b/paddle/api/Arguments.cpp @@ -144,9 +144,7 @@ void Arguments::setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError) { a.cpuSequenceDims = m->cast(vec->getSharedPtr()); } -float Arguments::sumCosts() const { - return paddle::Argument::sumCosts(m->outputs); -} +float Arguments::sum() const { return paddle::Argument::sum(m->outputs); } int64_t Arguments::getBatchSize(size_t idx) const throw(RangeError) { auto& a = m->getArg(idx); diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h index 762f86ac79461558b6a2eb7105ffd05961f5d3e2..c4f5dca26cc6a5e9fdd23ee27b594ced29a25c7a 100644 --- a/paddle/api/PaddleAPI.h +++ b/paddle/api/PaddleAPI.h @@ -453,7 +453,7 @@ public: IVector* vec) throw(RangeError); void setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError); - float sumCosts() const; + float sum() const; private: static Arguments* createByPaddleArgumentVector(void* ptr); diff --git a/paddle/api/test/testArguments.py b/paddle/api/test/testArguments.py index 
a04a805d7a64ef906c8388f1241b9ef823e4d9e0..9fe44de94ea6ddb71d2dfbb2243fc86ede0d0531 100644 --- a/paddle/api/test/testArguments.py +++ b/paddle/api/test/testArguments.py @@ -22,7 +22,7 @@ class TestArguments(unittest.TestCase): args = swig_paddle.Arguments.createArguments(1) args.setSlotValue(0, m) - self.assertAlmostEqual(27.0, args.sumCosts()) + self.assertAlmostEqual(27.0, args.sum()) mat = args.getSlotValue(0) assert isinstance(mat, swig_paddle.Matrix) diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp index ae016e74eaa84f7c43a30c09c8c4577e25360c4e..7617af10ba719490d1b33dd297b070cd8c7c292c 100644 --- a/paddle/gserver/tests/LayerGradUtil.cpp +++ b/paddle/gserver/tests/LayerGradUtil.cpp @@ -24,7 +24,7 @@ real getCostSum(LayerPtr& testLayer, MatrixPtr weights) { if (weights) { outArgs[0].value->dotMul(*outArgs[0].value, *weights); } - return Argument::sumCosts(outArgs); + return Argument::sum(outArgs); } real getDiffAndPrint(real newCost1, @@ -241,7 +241,7 @@ void testBatchState(LayerPtr testLayer, std::vector args; args.push_back(out); - EXPECT_EQ(0, Argument::sumCosts(args)) << "testBatchState failed"; + EXPECT_EQ(0, Argument::sum(args)) << "testBatchState failed"; for (size_t seqId = 0; seqId < numSequences; ++seqId) { start[seqId] += seqLens[seqId]; } @@ -672,7 +672,7 @@ void testLayerGradKernel(TestConfig testConf, outArgs[0].value->dotMul(*testLayer->getOutput().value, *weights); } - real cost = Argument::sumCosts(outArgs); + real cost = Argument::sum(outArgs); LOG(INFO) << " cost " << cost; EXPECT_FALSE(std::isnan(cost)); diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h index 178c068b93ac5fc1e06200984f14da86069cf7e4..9ef44be0cb3b960db1e789f3f26bb66d1fe63c81 100644 --- a/paddle/parameter/Argument.h +++ b/paddle/parameter/Argument.h @@ -163,7 +163,7 @@ struct Argument { : sequenceStartPositions->getData(false); } - static inline real sumCosts(const std::vector& arguments) { + static inline real 
sum(const std::vector& arguments) { real cost = 0; for (auto& arg : arguments) { if (arg.value) { diff --git a/paddle/pserver/test/CMakeLists.txt b/paddle/pserver/test/CMakeLists.txt index 64654f67d0c2c82f05a5038fb33b220f3cff0f39..6e8f9c37f64b70921e09241089a5a480fd8ca47f 100644 --- a/paddle/pserver/test/CMakeLists.txt +++ b/paddle/pserver/test/CMakeLists.txt @@ -10,9 +10,11 @@ add_test(NAME socket_test add_unittest_without_exec(test_ProtoServer test_ProtoServer.cpp) -add_test(NAME test_ProtoServer - COMMAND ${PROJ_ROOT}/paddle/.set_port.sh -p port - ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoServer) +IF(NOT ON_TRAVIS) + add_test(NAME test_ProtoServer + COMMAND ${PROJ_ROOT}/paddle/.set_port.sh -p port + ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoServer) +ENDIF(NOT ON_TRAVIS) # TODO(yuyang18): Run test_ProtoServer when with rdma # add_test(NAME test_ProtoServerRDMA diff --git a/paddle/setup.py.in b/paddle/setup.py.in index 38621af065913c9edd44958e9fb767c983c00dbb..382d5be6ecfc26b4a524bb6a775bd1a805a34d96 100644 --- a/paddle/setup.py.in +++ b/paddle/setup.py.in @@ -72,6 +72,7 @@ setup(name="py_paddle", packages=['py_paddle'], include_dirs = include_dirs, install_requires = [ + 'nltk>=3.2.2', 'numpy>=1.8.0', # The numpy is required. 
'protobuf>=3.0.0' # The paddle protobuf version ], diff --git a/paddle/trainer/Tester.cpp b/paddle/trainer/Tester.cpp index 13aa28ae5d9699d267858d48e46797c756487ddd..80664fa877b324af73e3e3effa11e46eac6294e2 100644 --- a/paddle/trainer/Tester.cpp +++ b/paddle/trainer/Tester.cpp @@ -208,7 +208,7 @@ real Tester::forwardOneBatch(const DataBatch& dataBatch, return 0.0; // In this case, there is no meaning to calculate cost } - return Argument::sumCosts(outArgs); + return Argument::sum(outArgs); } void Tester::testOnePassBatch(int passId) { diff --git a/paddle/trainer/Trainer.cpp b/paddle/trainer/Trainer.cpp index bd84545375117b178d4324f0ad03f5bc35ae925d..b68e29cd5ea223272151e7a8b52d998832f47103 100644 --- a/paddle/trainer/Trainer.cpp +++ b/paddle/trainer/Trainer.cpp @@ -310,7 +310,7 @@ real Trainer::checkGradient() { std::vector outArgs; trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC); - real cost = Argument::sumCosts(outArgs); + real cost = Argument::sum(outArgs); LOG(INFO) << "original cost=" << cost; trainerInternal_.getGradientMachine()->backward(); @@ -340,7 +340,7 @@ real Trainer::checkGradient() { parameter->getBuf(PARAMETER_VALUE)->copyFrom(newPara); parameter->setValueUpdated(); trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC); - real newCost1 = Argument::sumCosts(outArgs); + real newCost1 = Argument::sum(outArgs); for (size_t i = 0; i < dim; ++i) { newp[i] = oldp[i] - step * d[i]; @@ -349,7 +349,7 @@ real Trainer::checkGradient() { parameter->getBuf(PARAMETER_VALUE)->copyFrom(newPara); parameter->setValueUpdated(); trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC); - real newCost2 = Argument::sumCosts(outArgs); + real newCost2 = Argument::sum(outArgs); real trueDelta = 0.5 * (newCost1 - newCost2); real diff = (1e-20 + trueDelta) / (1e-20 + delta) - 1; @@ -575,7 +575,7 @@ real Trainer::calcGradient(const DataBatch& dataBatch, trainerInternal_.getGradientMachine()->forwardBackward( 
inArgs, &outArgs, PASS_TRAIN); - real cost = Argument::sumCosts(outArgs); + real cost = Argument::sum(outArgs); offset = 0; for (auto& para : parameters) { diff --git a/paddle/trainer/TrainerInternal.cpp b/paddle/trainer/TrainerInternal.cpp index f3b465b444167d4624a5e99c30e1257eda53ca2c..4c5d4a0913aaf3a9932b3d67806378ece4245304 100644 --- a/paddle/trainer/TrainerInternal.cpp +++ b/paddle/trainer/TrainerInternal.cpp @@ -134,7 +134,7 @@ void TrainerInternal::trainOneBatch(int64_t batchId, real cost = 0; { REGISTER_TIMER("sumCost"); - cost = Argument::sumCosts(*outArgs); + cost = Argument::sum(*outArgs); } if (batchId % intconfig_->log_period == 0) { diff --git a/python/paddle/trainer/PyDataProvider2.py b/python/paddle/trainer/PyDataProvider2.py index bd24c68b6fe88eab03c814f8cac70db3880316f4..4e3c4db853205bb12272e86295784a6069483ffe 100644 --- a/python/paddle/trainer/PyDataProvider2.py +++ b/python/paddle/trainer/PyDataProvider2.py @@ -65,14 +65,18 @@ def sparse_value_slot(dim, seq_type=SequenceType.NO_SEQUENCE): return InputType(dim, seq_type, DataType.SparseValue) -def index_slot(dim, seq_type=SequenceType.NO_SEQUENCE): - return InputType(dim, seq_type, DataType.Index) +def index_slot(value_range, seq_type=SequenceType.NO_SEQUENCE): + """Data type of integer. + :param value_range: range of this integer. + """ + return InputType(value_range, seq_type, DataType.Index) dense_vector = dense_slot sparse_binary_vector = sparse_non_value_slot sparse_vector = sparse_value_slot integer_value = index_slot +integer_value.__doc__ = index_slot.__doc__ def dense_vector_sequence(dim): @@ -99,8 +103,11 @@ def sparse_vector_sub_sequence(dim): return sparse_vector(dim, seq_type=SequenceType.SUB_SEQUENCE) -def integer_value_sequence(dim): - return integer_value(dim, seq_type=SequenceType.SEQUENCE) +def integer_value_sequence(value_range): + """Data type of a sequence of integer. + :param value_range: range of each element. 
+ """ + return integer_value(value_range, seq_type=SequenceType.SEQUENCE) def integer_value_sub_sequence(dim): @@ -108,6 +115,7 @@ def integer_value_sub_sequence(dim): integer_sequence = integer_value_sequence +integer_sequence.__doc__ = integer_value_sequence.__doc__ class SingleSlotWrapper(object): diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py index 8ab8cd2f85d5d7bcf86c2f57b350dfcd99177b69..f663ef735d6424c45815a73d112d135be0dc5f8e 100644 --- a/python/paddle/v2/__init__.py +++ b/python/paddle/v2/__init__.py @@ -25,14 +25,14 @@ from . import dataset from . import reader import attr import pooling -import inferencer +import inference import networks import py_paddle.swig_paddle as api __all__ = [ 'optimizer', 'layer', 'activation', 'parameters', 'init', 'trainer', 'event', 'data_type', 'attr', 'pooling', 'data_feeder', 'dataset', 'reader', - 'topology', 'networks', 'inferencer', 'infer' + 'topology', 'networks', 'infer' ] @@ -44,4 +44,4 @@ def init(**kwargs): api.initPaddle(*args) -infer = inferencer.infer +infer = inference.infer diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py index a1b21bab3bac8b304abb4ae292b1c1e9f3e719de..82f11a7c41149c2231130dc7c2205debb643aa89 100644 --- a/python/paddle/v2/dataset/__init__.py +++ b/python/paddle/v2/dataset/__init__.py @@ -18,5 +18,10 @@ import imdb import cifar import movielens import conll05 +import uci_housing +import sentiment -__all__ = ['mnist', 'imikolov', 'imdb', 'cifar', 'movielens', 'conll05'] +__all__ = [ + 'mnist', 'imikolov', 'imdb', 'cifar', 'movielens', 'conll05', 'sentiment' + 'uci_housing' +] diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py index 397c9e66d495431f412c22b9b1d19ee32257b2dd..3021b68ddb02ecaa874e21681796c0912ad4cc06 100644 --- a/python/paddle/v2/dataset/common.py +++ b/python/paddle/v2/dataset/common.py @@ -16,6 +16,7 @@ import requests import hashlib import os import shutil +import sys __all__ = 
['DATA_HOME', 'download', 'md5file'] @@ -41,9 +42,24 @@ def download(url, module_name, md5sum): filename = os.path.join(dirname, url.split('/')[-1]) if not (os.path.exists(filename) and md5file(filename) == md5sum): + print "Cache file %s not found, downloading %s" % (filename, url) r = requests.get(url, stream=True) - with open(filename, 'w') as f: - shutil.copyfileobj(r.raw, f) + total_length = r.headers.get('content-length') + + if total_length is None: + with open(filename, 'w') as f: + shutil.copyfileobj(r.raw, f) + else: + with open(filename, 'w') as f: + dl = 0 + total_length = int(total_length) + for data in r.iter_content(chunk_size=4096): + dl += len(data) + f.write(data) + done = int(50 * dl / total_length) + sys.stdout.write("\r[%s%s]" % ('=' * done, + ' ' * (50 - done))) + sys.stdout.flush() return filename diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py index ffd7d89049358e979db762af07701e010f40dc6e..f27756a38a9cd809fdaaf92e7f8a72b681915fc8 100644 --- a/python/paddle/v2/dataset/imdb.py +++ b/python/paddle/v2/dataset/imdb.py @@ -116,3 +116,8 @@ def test(word_idx): return reader_creator( re.compile("aclImdb/test/pos/.*\.txt$"), re.compile("aclImdb/test/neg/.*\.txt$"), word_idx, 1000) + + +def word_dict(): + return build_dict( + re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150) diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py index 285d3eaca8317c78dc84e99b4d524a0f4872c687..deb556942d9b0490ffab8cef90aae8f365652129 100644 --- a/python/paddle/v2/dataset/imikolov.py +++ b/python/paddle/v2/dataset/imikolov.py @@ -17,7 +17,7 @@ imikolov's simple dataset: http://www.fit.vutbr.cz/~imikolov/rnnlm/ import paddle.v2.dataset.common import tarfile -__all__ = ['train', 'test'] +__all__ = ['train', 'test', 'build_dict'] URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz' MD5 = '30177ea32e27c525793142b6bf2c8e2d' @@ -37,7 +37,9 @@ def word_count(f, word_freq=None): 
return word_freq -def build_dict(train_filename, test_filename): +def build_dict(): + train_filename = './simple-examples/data/ptb.train.txt' + test_filename = './simple-examples/data/ptb.valid.txt' with tarfile.open( paddle.v2.dataset.common.download( paddle.v2.dataset.imikolov.URL, 'imikolov', @@ -45,27 +47,22 @@ def build_dict(train_filename, test_filename): trainf = tf.extractfile(train_filename) testf = tf.extractfile(test_filename) word_freq = word_count(testf, word_count(trainf)) + if '' in word_freq: + # remove for now, since we will set it as last index + del word_freq[''] TYPO_FREQ = 50 word_freq = filter(lambda x: x[1] > TYPO_FREQ, word_freq.items()) - dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0])) - words, _ = list(zip(*dictionary)) + word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0])) + words, _ = list(zip(*word_freq_sorted)) word_idx = dict(zip(words, xrange(len(words)))) word_idx[''] = len(words) return word_idx -word_idx = {} - - -def reader_creator(filename, n): - global word_idx - if len(word_idx) == 0: - word_idx = build_dict('./simple-examples/data/ptb.train.txt', - './simple-examples/data/ptb.valid.txt') - +def reader_creator(filename, word_idx, n): def reader(): with tarfile.open( paddle.v2.dataset.common.download( @@ -84,9 +81,9 @@ def reader_creator(filename, n): return reader -def train(n): - return reader_creator('./simple-examples/data/ptb.train.txt', n) +def train(word_idx, n): + return reader_creator('./simple-examples/data/ptb.train.txt', word_idx, n) -def test(n): - return reader_creator('./simple-examples/data/ptb.valid.txt', n) +def test(word_idx, n): + return reader_creator('./simple-examples/data/ptb.valid.txt', word_idx, n) diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py new file mode 100644 index 0000000000000000000000000000000000000000..cbd08fa73684be42e8d8d2eb7b684d66894d7761 --- /dev/null +++ b/python/paddle/v2/dataset/sentiment.py @@ -0,0 +1,126 @@ +# 
/usr/bin/env python +# -*- coding:utf-8 -*- + +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +The script fetch and preprocess movie_reviews data set + +that provided by NLTK +""" + +import common +import collections +import nltk +import numpy as np +from itertools import chain +from nltk.corpus import movie_reviews + +__all__ = ['train', 'test', 'get_word_dict'] +NUM_TRAINING_INSTANCES = 1600 +NUM_TOTAL_INSTANCES = 2000 + + +def download_data_if_not_yet(): + """ + Download the data set, if the data set is not download. + """ + try: + # make sure that nltk can find the data + if common.DATA_HOME not in nltk.data.path: + nltk.data.path.append(common.DATA_HOME) + movie_reviews.categories() + except LookupError: + print "Downloading movie_reviews data set, please wait....." + nltk.download('movie_reviews', download_dir=common.DATA_HOME) + print "Download data set success....." 
+ print "Path is " + nltk.data.find('corpora/movie_reviews').path + + +def get_word_dict(): + """ + Sorted the words by the frequency of words which occur in sample + :return: + words_freq_sorted + """ + words_freq_sorted = list() + word_freq_dict = collections.defaultdict(int) + download_data_if_not_yet() + + for category in movie_reviews.categories(): + for field in movie_reviews.fileids(category): + for words in movie_reviews.words(field): + word_freq_dict[words] += 1 + words_sort_list = word_freq_dict.items() + words_sort_list.sort(cmp=lambda a, b: b[1] - a[1]) + for index, word in enumerate(words_sort_list): + words_freq_sorted.append((word[0], index)) + return words_freq_sorted + + +def sort_files(): + """ + Sorted the sample for cross reading the sample + :return: + files_list + """ + files_list = list() + neg_file_list = movie_reviews.fileids('neg') + pos_file_list = movie_reviews.fileids('pos') + files_list = list(chain.from_iterable(zip(neg_file_list, pos_file_list))) + return files_list + + +def load_sentiment_data(): + """ + Load the data set + :return: + data_set + """ + data_set = list() + download_data_if_not_yet() + words_ids = dict(get_word_dict()) + for sample_file in sort_files(): + words_list = list() + category = 0 if 'neg' in sample_file else 1 + for word in movie_reviews.words(sample_file): + words_list.append(words_ids[word.lower()]) + data_set.append((words_list, category)) + return data_set + + +def reader_creator(data): + """ + Reader creator, generate an iterator for data set + :param data: + train data set or test data set + """ + for each in data: + yield each[0], each[1] + + +def train(): + """ + Default train set reader creator + """ + data_set = load_sentiment_data() + return reader_creator(data_set[0:NUM_TRAINING_INSTANCES]) + + +def test(): + """ + Default test set reader creator + """ + data_set = load_sentiment_data() + return reader_creator(data_set[NUM_TRAINING_INSTANCES:]) diff --git 
a/python/paddle/v2/dataset/tests/imikolov_test.py b/python/paddle/v2/dataset/tests/imikolov_test.py index 9b1748eaaa7f913a6b94f2087a8089fb998570aa..009e55243a594e5e235c36fb0223ec70754d17f3 100644 --- a/python/paddle/v2/dataset/tests/imikolov_test.py +++ b/python/paddle/v2/dataset/tests/imikolov_test.py @@ -1,6 +1,8 @@ import paddle.v2.dataset.imikolov import unittest +WORD_DICT = paddle.v2.dataset.imikolov.build_dict() + class TestMikolov(unittest.TestCase): def check_reader(self, reader, n): @@ -9,11 +11,15 @@ class TestMikolov(unittest.TestCase): def test_train(self): n = 5 - self.check_reader(paddle.v2.dataset.imikolov.train(n), n) + self.check_reader(paddle.v2.dataset.imikolov.train(WORD_DICT, n), n) def test_test(self): n = 5 - self.check_reader(paddle.v2.dataset.imikolov.test(n), n) + self.check_reader(paddle.v2.dataset.imikolov.test(WORD_DICT, n), n) + + def test_total(self): + _, idx = zip(*WORD_DICT.items()) + self.assertEqual(sorted(idx)[-1], len(WORD_DICT) - 1) if __name__ == '__main__': diff --git a/python/paddle/v2/dataset/tests/test_sentiment.py b/python/paddle/v2/dataset/tests/test_sentiment.py new file mode 100644 index 0000000000000000000000000000000000000000..407405290734609059c1767600748d530e8a13a6 --- /dev/null +++ b/python/paddle/v2/dataset/tests/test_sentiment.py @@ -0,0 +1,55 @@ +# /usr/bin/env python +# -*- coding:utf-8 -*- + +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import nltk +import paddle.v2.dataset.sentiment as st +from nltk.corpus import movie_reviews + + +class TestSentimentMethods(unittest.TestCase): + def test_get_word_dict(self): + word_dict = st.get_word_dict()[0:10] + test_word_list = [(u',', 0), (u'the', 1), (u'.', 2), (u'a', 3), + (u'and', 4), (u'of', 5), (u'to', 6), (u"'", 7), + (u'is', 8), (u'in', 9)] + for idx, each in enumerate(word_dict): + self.assertEqual(each, test_word_list[idx]) + self.assertTrue("/root/.cache/paddle/dataset" in nltk.data.path) + + def test_sort_files(self): + last_label = '' + for sample_file in st.sort_files(): + current_label = sample_file.split("/")[0] + self.assertNotEqual(current_label, last_label) + last_label = current_label + + def test_data_set(self): + data_set = st.load_sentiment_data() + last_label = -1 + for each in st.test(): + self.assertNotEqual(each[1], last_label) + last_label = each[1] + self.assertEqual(len(data_set), st.NUM_TOTAL_INSTANCES) + self.assertEqual(len(list(st.train())), st.NUM_TRAINING_INSTANCES) + self.assertEqual( + len(list(st.test())), + (st.NUM_TOTAL_INSTANCES - st.NUM_TRAINING_INSTANCES)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py new file mode 100644 index 0000000000000000000000000000000000000000..b5a0537af66a3fae4e1b267ae25441a6cb75416b --- /dev/null +++ b/python/paddle/v2/dataset/uci_housing.py @@ -0,0 +1,86 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import os +from common import download + +__all__ = ['train', 'test'] + +URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data' +MD5 = 'd4accdce7a25600298819f8e28e8d593' +feature_names = [ + 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', + 'PTRATIO', 'B', 'LSTAT' +] + +UCI_TRAIN_DATA = None +UCI_TEST_DATA = None + + +def feature_range(maximums, minimums): + import matplotlib + matplotlib.use('Agg') + import matplotlib.pyplot as plt + fig, ax = plt.subplots() + feature_num = len(maximums) + ax.bar(range(feature_num), maximums - minimums, color='r', align='center') + ax.set_title('feature scale') + plt.xticks(range(feature_num), feature_names) + plt.xlim([-1, feature_num]) + fig.set_figheight(6) + fig.set_figwidth(10) + if not os.path.exists('./image'): + os.makedirs('./image') + fig.savefig('image/ranges.png', dpi=48) + plt.close(fig) + + +def load_data(filename, feature_num=14, ratio=0.8): + global UCI_TRAIN_DATA, UCI_TEST_DATA + if UCI_TRAIN_DATA is not None and UCI_TEST_DATA is not None: + return + + data = np.fromfile(filename, sep=' ') + data = data.reshape(data.shape[0] / feature_num, feature_num) + maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum( + axis=0) / data.shape[0] + feature_range(maximums[:-1], minimums[:-1]) + for i in xrange(feature_num - 1): + data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i]) + offset = int(data.shape[0] * ratio) + UCI_TRAIN_DATA = data[:offset] + UCI_TEST_DATA = data[offset:] + + +def train(): + global UCI_TRAIN_DATA + load_data(download(URL, 'uci_housing', MD5)) + + def reader(): + for d in UCI_TRAIN_DATA: + yield d[:-1], d[-1:] + + return reader + + +def test(): + global UCI_TEST_DATA + load_data(download(URL, 'uci_housing', MD5)) + + def reader(): + for d in UCI_TEST_DATA: + yield d[:-1], d[-1:] + + return 
reader diff --git a/python/paddle/v2/event.py b/python/paddle/v2/event.py index a78bcf076cc65e0dfdfc5760e099900418162f35..a429e36b63c9e812332673b66f4d8b99f3303cf8 100644 --- a/python/paddle/v2/event.py +++ b/python/paddle/v2/event.py @@ -34,8 +34,9 @@ class WithMetric(object): class TestResult(WithMetric): - def __init__(self, evaluator): + def __init__(self, evaluator, cost): super(TestResult, self).__init__(evaluator) + self.cost = cost class BeginPass(object): diff --git a/python/paddle/v2/inferencer.py b/python/paddle/v2/inference.py similarity index 98% rename from python/paddle/v2/inferencer.py rename to python/paddle/v2/inference.py index ac03b016c9b8bfbc586072855402ed3a373e9b54..476fd3fa4523a77709f68c73c73e6851e04064aa 100644 --- a/python/paddle/v2/inferencer.py +++ b/python/paddle/v2/inference.py @@ -5,7 +5,7 @@ from data_feeder import DataFeeder import itertools import numpy -__all__ = ['Inference', 'infer'] +__all__ = ['infer'] class Inference(object): diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py index 2f55611aaa1d3ae22f5d7f184b38e622271881ea..010773ddbd96d4226cccc1a63cfc133b78bdcffe 100644 --- a/python/paddle/v2/layer.py +++ b/python/paddle/v2/layer.py @@ -262,7 +262,7 @@ class StaticInputV2(object): self.input = input self.is_seq = is_seq self.size = size - # TODO(qiaolongfei): add size + # TODO(add size check) # assert input.size is not None or size is not None diff --git a/python/paddle/v2/tests/test_data_feeder.py b/python/paddle/v2/tests/test_data_feeder.py index ab2bc5df76cd839b5b0184e9559f0c2e03baf38b..71eb3bf31425c22b47accc11c9550042e077ef12 100644 --- a/python/paddle/v2/tests/test_data_feeder.py +++ b/python/paddle/v2/tests/test_data_feeder.py @@ -110,14 +110,14 @@ class DataFeederTest(unittest.TestCase): self.assertAlmostEqual(value.all(), w[i].all()) def test_integer(self): - dim = 100 + value_range = 100 batch_size = 32 index = [] for i in xrange(batch_size): each_sample = [] - each_sample.append(np.random.randint(dim)) 
+ each_sample.append(np.random.randint(value_range)) index.append(each_sample) - feeder = DataFeeder([('input', data_type.integer_value(dim))], + feeder = DataFeeder([('input', data_type.integer_value(value_range))], {'input': 0}) arg = feeder(index) output = arg.getSlotIds(0).copyToNumpyArray() @@ -125,7 +125,7 @@ class DataFeederTest(unittest.TestCase): self.assertEqual(output.all(), index.flatten().all()) def test_integer_sequence(self): - dim = 10000 + value_range = 10000 batch_size = 32 start = [0] data = [] @@ -133,11 +133,12 @@ class DataFeederTest(unittest.TestCase): each_sample = [] each_sample.append( self.sparse_binary_reader( - dim, 30, non_empty=True)) + value_range, 30, non_empty=True)) data.append(each_sample) start.append(len(each_sample[0]) + start[-1]) - feeder = DataFeeder([('input', data_type.integer_value_sequence(dim))], - {'input': 0}) + feeder = DataFeeder( + [('input', data_type.integer_value_sequence(value_range))], + {'input': 0}) arg = feeder(data) output_data = arg.getSlotIds(0).copyToNumpyArray() output_start = arg.getSlotSequenceStartPositions(0).copyToNumpyArray() diff --git a/python/paddle/v2/topology.py b/python/paddle/v2/topology.py index 4c211254319bbdf46b02a2cee56b6a98b01819a2..f0679c5675b0c0f24f28f3df22efd4eb51ccbb3a 100644 --- a/python/paddle/v2/topology.py +++ b/python/paddle/v2/topology.py @@ -17,6 +17,7 @@ import collections from paddle.proto.ModelConfig_pb2 import ModelConfig import layer as v2_layer +from layer import WithExtraParent __all__ = ['Topology'] @@ -40,7 +41,10 @@ def __bfs_travel__(callback, *layers): __break__ = callback(each_layer) if __break__: return - __bfs_travel__(callback, *each_layer.__parent_layers__.values()) + __layers__ = each_layer.__parent_layers__.values() + if isinstance(each_layer, WithExtraParent): + __layers__ = __layers__ + each_layer.extra_parent() + __bfs_travel__(callback, *__layers__) class Topology(object): diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py index 
abaad1d0213c63f92a72b30158d41fd41119d078..8bcdd122b30bde91f652f351dddc27734fdf33cf 100644 --- a/python/paddle/v2/trainer.py +++ b/python/paddle/v2/trainer.py @@ -8,7 +8,7 @@ from . import event as v2_event from . import optimizer as v2_optimizer from . import parameters as v2_parameters -__all__ = ['ITrainer', 'SGD'] +__all__ = ['SGD'] def default_event_handler(event): @@ -22,26 +22,7 @@ def default_event_handler(event): pass -class ITrainer(object): - """ - The interface of Trainer. The only exposed method is `train`. - """ - - def train(self, reader, topology, parameters, event_handler=None): - """ - train method. - - :param reader: - :param topology: - :param parameters: - :param event_handler: - :return: - """ - - raise NotImplementedError() - - -class SGD(ITrainer): +class SGD(): def __init__(self, cost, parameters, update_equation): """ Simple SGD Trainer. @@ -120,10 +101,8 @@ class SGD(ITrainer): for each_param in self.__gradient_machine__.getNonStaticParameters( ): updater.update(each_param) - # Get cost. We use numpy to calculate total cost for this batch. - cost_vec = out_args.getSlotValue(0) - cost_vec = cost_vec.copyToNumpyMat() - cost = cost_vec.sum() / len(data_batch) + cost_sum = out_args.sumCosts() + cost = cost_sum / len(data_batch) updater.finishBatch(cost) batch_evaluator.finish() event_handler( @@ -152,13 +131,18 @@ class SGD(ITrainer): evaluator = self.__gradient_machine__.makeEvaluator() out_args = api.Arguments.createArguments(0) evaluator.start() + total_cost = 0 + num_samples = 0.0 for data_batch in reader(): + num_samples += len(data_batch) self.__gradient_machine__.forward( feeder(data_batch), out_args, api.PASS_TEST) + total_cost += out_args.sumCosts() self.__gradient_machine__.eval(evaluator) evaluator.finish() - return v2_event.TestResult(evaluator=evaluator) + return v2_event.TestResult( + evaluator=evaluator, cost=total_cost / num_samples) def __check_train_args__(reader, event_handler, **kwargs):