diff --git a/.travis.yml b/.travis.yml index 28d1f51be7107b57594eb8aacab2e82d2dec624a..5a7f45a748ac7e81f3f90c245bcf2cd84c4e9027 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,22 +4,14 @@ cache: - $HOME/third_party - $HOME/.ccache - $HOME/.cache/pip - - $HOME/Library/Caches/Homebrew sudo: required dist: trusty os: - linux - - osx env: - JOB=DOCS - JOB=BUILD_AND_TEST - JOB=PRE_COMMIT -matrix: - exclude: - - os: osx - env: JOB=DOCS # Only generate documentation in linux. - - os: osx - env: JOB=PRE_COMMIT # Only check pre-commit hook in linux addons: apt: @@ -53,7 +45,6 @@ before_install: fi fi fi - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python # protobuf version. diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 84f459033f06f89d3b150317793c7e62274468b2..26da7e8e384bafdcbcd1a358c39cc6eb167b067e 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -14,46 +14,50 @@ INCLUDE(ExternalProject) -SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/protobuf) -SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/protobuf) -SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" CACHE PATH "protobuf include directory." FORCE) +FIND_PACKAGE(Protobuf) -INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR}) +IF(NOT PROTOBUF_FOUND) + SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/protobuf) + SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/protobuf) + SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" CACHE PATH "protobuf include directory." FORCE) + + IF(WIN32) + SET(PROTOBUF_LITE_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.lib" CACHE FILEPATH "protobuf lite library." FORCE) + SET(PROTOBUF_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.lib" CACHE FILEPATH "protobuf library." FORCE) + SET(PROTOBUF_PROTOC_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.lib" CACHE FILEPATH "protoc library." FORCE) + SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc.exe" CACHE FILEPATH "protobuf executable." FORCE) + ELSE(WIN32) + SET(PROTOBUF_LITE_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.a" CACHE FILEPATH "protobuf lite library." FORCE) + SET(PROTOBUF_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.a" CACHE FILEPATH "protobuf library." FORCE) + SET(PROTOBUF_PROTOC_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.a" CACHE FILEPATH "protoc library." FORCE) + SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc" CACHE FILEPATH "protobuf executable." FORCE) + ENDIF(WIN32) -IF(WIN32) - SET(PROTOBUF_LITE_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.lib" CACHE FILEPATH "protobuf lite library." FORCE) - SET(PROTOBUF_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.lib" CACHE FILEPATH "protobuf library." FORCE) - SET(PROTOBUF_PROTOC_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.lib" CACHE FILEPATH "protoc library." FORCE) - SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc.exe" CACHE FILEPATH "protobuf executable." FORCE) -ELSE(WIN32) - SET(PROTOBUF_LITE_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.a" CACHE FILEPATH "protobuf lite library." FORCE) - SET(PROTOBUF_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.a" CACHE FILEPATH "protobuf library." FORCE) - SET(PROTOBUF_PROTOC_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.a" CACHE FILEPATH "protoc library." FORCE) - SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc" CACHE FILEPATH "protobuf executable." FORCE) -ENDIF(WIN32) + ExternalProject_Add( + protobuf + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${PROTOBUF_SOURCES_DIR} + UPDATE_COMMAND "" + DEPENDS zlib + GIT_REPOSITORY "https://github.com/google/protobuf.git" + GIT_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546" + CONFIGURE_COMMAND + ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/protobuf/cmake + -Dprotobuf_BUILD_TESTS=OFF + -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=Release + -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=lib + ) -ExternalProject_Add( - protobuf - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${PROTOBUF_SOURCES_DIR} - UPDATE_COMMAND "" - DEPENDS zlib - GIT_REPOSITORY "https://github.com/google/protobuf.git" - GIT_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546" - CONFIGURE_COMMAND - ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/protobuf/cmake - -Dprotobuf_BUILD_TESTS=OFF - -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT} - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=Release - -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR=lib -) + LIST(APPEND external_project_dependencies protobuf) +ENDIF(NOT PROTOBUF_FOUND) -LIST(APPEND external_project_dependencies protobuf) +INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR}) diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index 6372a9a768e580f74f837ccb6c57d4f4395eb779..0accf1a8dd83560324716f0f4936be56dd7a9f1b 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -221,7 +221,3 @@ ENDIF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND) INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR}) INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR}) - -MESSAGE("[Paddle] Python Executable: ${PYTHON_EXECUTABLE}") -MESSAGE("[Paddle] Python Include: ${PYTHON_INCLUDE_DIRS}") -MESSAGE("[Paddle] Python Libraries: ${PYTHON_LIBRARIES}") diff --git a/demo/mnist/api_train_v2.py b/demo/mnist/api_train_v2.py index 06beb7024d1fd07dc327cb4c09d74e1b89a7b8ff..575a32b3222a83afa0f01bef59037dca102773e7 100644 --- a/demo/mnist/api_train_v2.py +++ b/demo/mnist/api_train_v2.py @@ -30,10 +30,11 @@ def main(): result = trainer.test(reader=paddle.reader.batched( paddle.dataset.mnist.test(), batch_size=256)) - print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics, - result.metrics) - + print "Pass %d, Batch %d, Cost %.2f, %s\n" \ + "Testing cost %.2f metrics %s" % ( + event.pass_id, event.batch_id, event.cost, + event.metrics, + result.cost, result.metrics) else: pass diff --git a/demo/semantic_role_labeling/api_train_v2.py b/demo/semantic_role_labeling/api_train_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..15db922b97abc5ae79f095edfd632604eec8ab94 --- /dev/null +++ b/demo/semantic_role_labeling/api_train_v2.py @@ -0,0 +1,190 @@ +import sys +import math +import numpy as np +import paddle.v2 as paddle +import paddle.v2.dataset.conll05 as conll05 + + +def db_lstm(): + word_dict, verb_dict, label_dict = conll05.get_dict() + word_dict_len = len(word_dict) + label_dict_len = len(label_dict) + pred_len = len(verb_dict) + + mark_dict_len = 2 + word_dim = 32 + mark_dim = 5 + hidden_dim = 512 + depth = 8 + + #8 features + def d_type(size): + return paddle.data_type.integer_value_sequence(size) + + word = paddle.layer.data(name='word_data', type=d_type(word_dict_len)) + predicate = paddle.layer.data(name='verb_data', type=d_type(pred_len)) + + ctx_n2 = paddle.layer.data(name='ctx_n2_data', type=d_type(word_dict_len)) + ctx_n1 = paddle.layer.data(name='ctx_n1_data', type=d_type(word_dict_len)) + ctx_0 = paddle.layer.data(name='ctx_0_data', type=d_type(word_dict_len)) + ctx_p1 = paddle.layer.data(name='ctx_p1_data', type=d_type(word_dict_len)) + ctx_p2 = paddle.layer.data(name='ctx_p2_data', type=d_type(word_dict_len)) + mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_len)) + + target = paddle.layer.data(name='target', type=d_type(label_dict_len)) + + default_std = 1 / math.sqrt(hidden_dim) / 3.0 + + emb_para = paddle.attr.Param(name='emb', initial_std=0., learning_rate=0.) + std_0 = paddle.attr.Param(initial_std=0.) + std_default = paddle.attr.Param(initial_std=default_std) + + predicate_embedding = paddle.layer.embedding( + size=word_dim, + input=predicate, + param_attr=paddle.attr.Param( + name='vemb', initial_std=default_std)) + mark_embedding = paddle.layer.embedding( + size=mark_dim, input=mark, param_attr=std_0) + + word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] + emb_layers = [ + paddle.layer.embedding( + size=word_dim, input=x, param_attr=emb_para) for x in word_input + ] + emb_layers.append(predicate_embedding) + emb_layers.append(mark_embedding) + + hidden_0 = paddle.layer.mixed( + size=hidden_dim, + bias_attr=std_default, + input=[ + paddle.layer.full_matrix_projection( + input=emb, param_attr=std_default) for emb in emb_layers + ]) + + mix_hidden_lr = 1e-3 + lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0) + hidden_para_attr = paddle.attr.Param( + initial_std=default_std, learning_rate=mix_hidden_lr) + + lstm_0 = paddle.layer.lstmemory( + input=hidden_0, + act=paddle.activation.Relu(), + gate_act=paddle.activation.Sigmoid(), + state_act=paddle.activation.Sigmoid(), + bias_attr=std_0, + param_attr=lstm_para_attr) + + #stack L-LSTM and R-LSTM with direct edges + input_tmp = [hidden_0, lstm_0] + + for i in range(1, depth): + mix_hidden = paddle.layer.mixed( + size=hidden_dim, + bias_attr=std_default, + input=[ + paddle.layer.full_matrix_projection( + input=input_tmp[0], param_attr=hidden_para_attr), + paddle.layer.full_matrix_projection( + input=input_tmp[1], param_attr=lstm_para_attr) + ]) + + lstm = paddle.layer.lstmemory( + input=mix_hidden, + act=paddle.activation.Relu(), + gate_act=paddle.activation.Sigmoid(), + state_act=paddle.activation.Sigmoid(), + reverse=((i % 2) == 1), + bias_attr=std_0, + param_attr=lstm_para_attr) + + input_tmp = [mix_hidden, lstm] + + feature_out = paddle.layer.mixed( + size=label_dict_len, + bias_attr=std_default, + input=[ + paddle.layer.full_matrix_projection( + input=input_tmp[0], param_attr=hidden_para_attr), + paddle.layer.full_matrix_projection( + input=input_tmp[1], param_attr=lstm_para_attr) + ], ) + + crf_cost = paddle.layer.crf(size=label_dict_len, + input=feature_out, + label=target, + param_attr=paddle.attr.Param( + name='crfw', + initial_std=default_std, + learning_rate=mix_hidden_lr)) + + crf_dec = paddle.layer.crf_decoding( + name='crf_dec_l', + size=label_dict_len, + input=feature_out, + label=target, + param_attr=paddle.attr.Param(name='crfw')) + + return crf_cost, crf_dec + + +def load_parameter(file_name, h, w): + with open(file_name, 'rb') as f: + f.read(16) # skip header. + return np.fromfile(f, dtype=np.float32).reshape(h, w) + + +def main(): + paddle.init(use_gpu=False, trainer_count=1) + + # define network topology + crf_cost, crf_dec = db_lstm() + + # create parameters + parameters = paddle.parameters.create([crf_cost, crf_dec]) + + # create optimizer + optimizer = paddle.optimizer.Momentum( + momentum=0, + learning_rate=2e-2, + regularization=paddle.optimizer.L2Regularization(rate=8e-4), + model_average=paddle.optimizer.ModelAverage( + average_window=0.5, max_average_window=10000), ) + + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "Pass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + + trainer = paddle.trainer.SGD(cost=crf_cost, + parameters=parameters, + update_equation=optimizer) + parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32)) + + trn_reader = paddle.reader.batched( + paddle.reader.shuffle( + conll05.test(), buf_size=8192), batch_size=10) + + reader_dict = { + 'word_data': 0, + 'ctx_n2_data': 1, + 'ctx_n1_data': 2, + 'ctx_0_data': 3, + 'ctx_p1_data': 4, + 'ctx_p2_data': 5, + 'verb_data': 6, + 'mark_data': 7, + 'target': 8 + } + + trainer.train( + reader=trn_reader, + event_handler=event_handler, + num_passes=10000, + reader_dict=reader_dict) + + +if __name__ == '__main__': + main() diff --git a/demo/sentiment/train_v2.py b/demo/sentiment/train_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..0fa74948533b4362a7a9206e7a787cf217ca5ca2 --- /dev/null +++ b/demo/sentiment/train_v2.py @@ -0,0 +1,247 @@ +import sys +from os.path import join as join_path +import paddle.trainer_config_helpers.attrs as attrs +from paddle.trainer_config_helpers.poolings import MaxPooling +import paddle.v2.layer as layer +import paddle.v2.activation as activation +import paddle.v2.data_type as data_type +import paddle.v2.dataset.imdb as imdb +import paddle.v2 as paddle + + +def sequence_conv_pool(input, + input_size, + context_len, + hidden_size, + name=None, + context_start=None, + pool_type=None, + context_proj_layer_name=None, + context_proj_param_attr=False, + fc_layer_name=None, + fc_param_attr=None, + fc_bias_attr=None, + fc_act=None, + pool_bias_attr=None, + fc_attr=None, + context_attr=None, + pool_attr=None): + """ + Text convolution pooling layers helper. + + Text input => Context Projection => FC Layer => Pooling => Output. + + :param name: name of output layer(pooling layer name) + :type name: basestring + :param input: name of input layer + :type input: LayerOutput + :param context_len: context projection length. See + context_projection's document. + :type context_len: int + :param hidden_size: FC Layer size. + :type hidden_size: int + :param context_start: context projection length. See + context_projection's context_start. + :type context_start: int or None + :param pool_type: pooling layer type. See pooling_layer's document. + :type pool_type: BasePoolingType. + :param context_proj_layer_name: context projection layer name. + None if user don't care. + :type context_proj_layer_name: basestring + :param context_proj_param_attr: context projection parameter attribute. + None if user don't care. + :type context_proj_param_attr: ParameterAttribute or None. + :param fc_layer_name: fc layer name. None if user don't care. + :type fc_layer_name: basestring + :param fc_param_attr: fc layer parameter attribute. None if user don't care. + :type fc_param_attr: ParameterAttribute or None + :param fc_bias_attr: fc bias parameter attribute. False if no bias, + None if user don't care. + :type fc_bias_attr: ParameterAttribute or None + :param fc_act: fc layer activation type. None means tanh + :type fc_act: BaseActivation + :param pool_bias_attr: pooling layer bias attr. None if don't care. + False if no bias. + :type pool_bias_attr: ParameterAttribute or None. + :param fc_attr: fc layer extra attribute. + :type fc_attr: ExtraLayerAttribute + :param context_attr: context projection layer extra attribute. + :type context_attr: ExtraLayerAttribute + :param pool_attr: pooling layer extra attribute. + :type pool_attr: ExtraLayerAttribute + :return: output layer name. + :rtype: LayerOutput + """ + # Set Default Value to param + context_proj_layer_name = "%s_conv_proj" % name \ + if context_proj_layer_name is None else context_proj_layer_name + + with layer.mixed( + name=context_proj_layer_name, + size=input_size * context_len, + act=activation.Linear(), + layer_attr=context_attr) as m: + m += layer.context_projection( + input=input, + context_len=context_len, + context_start=context_start, + padding_attr=context_proj_param_attr) + + fc_layer_name = "%s_conv_fc" % name \ + if fc_layer_name is None else fc_layer_name + fl = layer.fc(name=fc_layer_name, + input=m, + size=hidden_size, + act=fc_act, + layer_attr=fc_attr, + param_attr=fc_param_attr, + bias_attr=fc_bias_attr) + + return layer.pooling( + name=name, + input=fl, + pooling_type=pool_type, + bias_attr=pool_bias_attr, + layer_attr=pool_attr) + + +def convolution_net(input_dim, + class_dim=2, + emb_dim=128, + hid_dim=128, + is_predict=False): + data = layer.data("word", data_type.integer_value_sequence(input_dim)) + emb = layer.embedding(input=data, size=emb_dim) + conv_3 = sequence_conv_pool( + input=emb, input_size=emb_dim, context_len=3, hidden_size=hid_dim) + conv_4 = sequence_conv_pool( + input=emb, input_size=emb_dim, context_len=4, hidden_size=hid_dim) + output = layer.fc(input=[conv_3, conv_4], + size=class_dim, + act=activation.Softmax()) + lbl = layer.data("label", data_type.integer_value(2)) + cost = layer.classification_cost(input=output, label=lbl) + return cost + + +def stacked_lstm_net(input_dim, + class_dim=2, + emb_dim=128, + hid_dim=512, + stacked_num=3, + is_predict=False): + """ + A Wrapper for sentiment classification task. + This network uses bi-directional recurrent network, + consisting three LSTM layers. This configure is referred to + the paper as following url, but use fewer layrs. + http://www.aclweb.org/anthology/P15-1109 + + input_dim: here is word dictionary dimension. + class_dim: number of categories. + emb_dim: dimension of word embedding. + hid_dim: dimension of hidden layer. + stacked_num: number of stacked lstm-hidden layer. + is_predict: is predicting or not. + Some layers is not needed in network when predicting. + """ + assert stacked_num % 2 == 1 + + layer_attr = attrs.ExtraLayerAttribute(drop_rate=0.5) + fc_para_attr = attrs.ParameterAttribute(learning_rate=1e-3) + lstm_para_attr = attrs.ParameterAttribute(initial_std=0., learning_rate=1.) + para_attr = [fc_para_attr, lstm_para_attr] + bias_attr = attrs.ParameterAttribute(initial_std=0., l2_rate=0.) + relu = activation.Relu() + linear = activation.Linear() + + data = layer.data("word", data_type.integer_value_sequence(input_dim)) + emb = layer.embedding(input=data, size=emb_dim) + + fc1 = layer.fc(input=emb, size=hid_dim, act=linear, bias_attr=bias_attr) + lstm1 = layer.lstmemory( + input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr) + + inputs = [fc1, lstm1] + for i in range(2, stacked_num + 1): + fc = layer.fc(input=inputs, + size=hid_dim, + act=linear, + param_attr=para_attr, + bias_attr=bias_attr) + lstm = layer.lstmemory( + input=fc, + reverse=(i % 2) == 0, + act=relu, + bias_attr=bias_attr, + layer_attr=layer_attr) + inputs = [fc, lstm] + + fc_last = layer.pooling(input=inputs[0], pooling_type=MaxPooling()) + lstm_last = layer.pooling(input=inputs[1], pooling_type=MaxPooling()) + output = layer.fc(input=[fc_last, lstm_last], + size=class_dim, + act=activation.Softmax(), + bias_attr=bias_attr, + param_attr=para_attr) + + lbl = layer.data("label", data_type.integer_value(2)) + cost = layer.classification_cost(input=output, label=lbl) + return cost + + +if __name__ == '__main__': + # init + paddle.init(use_gpu=True, trainer_count=4) + + # network config + print 'load dictionary...' + word_dict = imdb.word_dict() + dict_dim = len(word_dict) + class_dim = 2 + + # Please choose the way to build the network + # by uncommenting the corresponding line. + cost = convolution_net(dict_dim, class_dim=class_dim) + # cost = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3) + + # create parameters + parameters = paddle.parameters.create(cost) + + # create optimizer + adam_optimizer = paddle.optimizer.Adam( + learning_rate=2e-3, + regularization=paddle.optimizer.L2Regularization(rate=8e-4), + model_average=paddle.optimizer.ModelAverage(average_window=0.5)) + + # End batch and end pass event handler + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "\nPass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + else: + sys.stdout.write('.') + sys.stdout.flush() + if isinstance(event, paddle.event.EndPass): + result = trainer.test( + reader=paddle.reader.batched( + lambda: imdb.test(word_dict), batch_size=128), + reader_dict={'word': 0, + 'label': 1}) + print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) + + # create trainer + trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=adam_optimizer) + + trainer.train( + reader=paddle.reader.batched( + paddle.reader.shuffle( + lambda: imdb.train(word_dict), buf_size=1000), + batch_size=100), + event_handler=event_handler, + reader_dict={'word': 0, + 'label': 1}, + num_passes=10) diff --git a/doc/design/reader/README.md b/doc/design/reader/README.md index 17d52b9e20b8130688028092421f4b33f44763ac..03119fdd74502a4534c2e6a576580ce96a721c7e 100644 --- a/doc/design/reader/README.md +++ b/doc/design/reader/README.md @@ -4,9 +4,10 @@ At training and testing time, PaddlePaddle programs need to read data. To ease t - A *reader* is a function that reads data (from file, network, random number generator, etc) and yields data items. - A *reader creator* is a function that returns a reader function. -- A *reader* decorator is a function, which accepts one or more readers, and returns a reader. +- A *reader decorator* is a function, which accepts one or more readers, and returns a reader. +- A *batch reader* is a function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items. -and provide frequently used reader creators and reader decorators. +and provide function which converts reader to batch reader, frequently used reader creators and reader decorators. ## Data Reader Interface @@ -37,9 +38,54 @@ def reader_creator_random_imageand_label(widht, height, label): return reader ``` +## Batch Reader Interface + +*batch reader* can be any function with no parameter that creates a iterable (anything can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list must be a tuple. + +Here are valid outputs: +```python +# a mini batch of three data items. Each data item consist three columns of data, each of which is 1. +[(1, 1, 1), +(2, 2, 2), +(3, 3, 3)] + +# a mini batch of three data items, each data item is a list (single column). +[([1,1,1],), +([2,2,2],), +([3,3,3],), +``` + +Please note that each item inside the list must be a tuple, below is an invalid output: +```python + # wrong, [1,1,1] needs to be inside a tuple: ([1,1,1],). + # Otherwise it's ambiguous whether [1,1,1] means a single column of data [1, 1, 1], + # or three column of datas, each of which is 1. +[[1,1,1], +[2,2,2], +[3,3,3]] +``` + +It's easy to convert from reader to batch reader: +```python +mnist_train = paddle.dataset.mnist.train() +mnist_train_batch_reader = paddle.batch(mnist_train, 128) +``` + +Also easy to create custom batch reader: +```python +def custom_batch_reader(): + while True: + batch = [] + for i in xrange(128): + batch.append((numpy.random.uniform(-1, 1, 28*28),)) # note that it's a tuple being appended. + yield batch + +mnist_random_image_batch_reader = custom_batch_reader +``` + ## Usage -data reader, mapping from item(s) read to data layer, batch size and number of total pass will be passed into `paddle.train`: +batch reader, mapping from item(s) read to data layer, batch size and number of total pass will be passed into `paddle.train`: ```python # two data layer is created: @@ -47,8 +93,8 @@ image_layer = paddle.layer.data("image", ...) label_layer = paddle.layer.data("label", ...) # ... - -paddle.train(paddle.dataset.mnist, {"image":0, "label":1}, 128, 10, ...) +batch_reader = paddle.batch(paddle.dataset.mnist.train(), 128) +paddle.train(batch_reader, {"image":0, "label":1}, 128, 10, ...) ``` ## Data Reader Decorator @@ -64,7 +110,7 @@ Since reading data may take time and training can not proceed without data. It i Use `paddle.reader.buffered` to prefetch data: ```python -buffered_reader = paddle.reader.buffered(paddle.dataset.mnist, 100) +buffered_reader = paddle.reader.buffered(paddle.dataset.mnist.train(), 100) ``` `buffered_reader` will try to buffer (prefetch) `100` data entries. @@ -91,10 +137,10 @@ def reader_creator_bool(t): true_reader = reader_creator_bool(True) false_reader = reader_creator_bool(False) -reader = paddle.reader.compose(paddle.dataset.mnist, data_reader_creator_random_image(20, 20), true_reader, false_reader) -# Skipped 1 because paddle.dataset.mnist produces two items per data entry. +reader = paddle.reader.compose(paddle.dataset.mnist.train(), data_reader_creator_random_image(20, 20), true_reader, false_reader) +# Skipped 1 because paddle.dataset.mnist.train() produces two items per data entry. # And we don't care second item at this time. -paddle.train(reader, {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...) +paddle.train(paddle.batch(reader, 128), {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...) ``` ### Shuffle @@ -103,16 +149,20 @@ Given shuffle buffer size `n`, `paddle.reader.shuffle` will return a data reader Example: ```python -reader = paddle.reader.shuffle(paddle.dataset.mnist, 512) +reader = paddle.reader.shuffle(paddle.dataset.mnist.train(), 512) ``` ## Q & A -### Why return only a single entry, but not a mini batch? +### Why reader return only a single entry, but not a mini batch? + +Always returning a single entry make reusing existing data readers much easier (e.g., if existing reader return not a single entry but 3 entries, training code will be more complex because it need to handle cases like batch size 2). + +We provide function `paddle.batch` to turn (single entry) reader into batch reader. -If a mini batch is returned, data reader need to take care of batch size. But batch size is a concept for training, it makes more sense for user to specify batch size as a parameter for `train`. +### Why do we need batch reader, isn't train take reader and batch_size as arguments sufficient? -Practically, always return a single entry make reusing existing data readers much easier (e.g., if existing reader return not a single entry but 3 entries, training code will be more complex because it need to handle cases like batch size 2). +In most of the case, train taking reader and batch_size as arguments would be sufficent. However sometimes user want to customize order of data entries inside a mini batch. Or even change batch size dynamically. ### Why use a dictionary but not a list to provide mapping? @@ -137,7 +187,7 @@ def image_reader_creator(image_path, label_path, n): # images_reader_creator creates a reader reader = image_reader_creator("/path/to/image_file", "/path/to/label_file", 1024) -paddle.train(reader, {"image":0, "label":1}, ...) +paddle.train(paddle.batch(reader, 128), {"image":0, "label":1}, ...) ``` ### How is `paddle.train` implemented @@ -145,17 +195,8 @@ paddle.train(reader, {"image":0, "label":1}, ...) An example implementation of paddle.train could be: ```python -def make_minibatch(reader, minibatch_size): - def ret(): - r = reader() - buf = [r.next() for x in xrange(minibatch_size)] - while len(buf) > 0: - yield buf - buf = [r.next() for x in xrange(minibatch_size)] - return ret - -def train(reader, mapping, batch_size, total_pass): +def train(batch_reader, mapping, batch_size, total_pass): for pass_idx in range(total_pass): - for mini_batch in make_minibatch(reader): # this loop will never end in online learning. + for mini_batch in batch_reader(): # this loop will never end in online learning. do_forward_backward(mini_batch, mapping) ``` diff --git a/doc/howto/usage/k8s/src/k8s_train/start_paddle.py b/doc/howto/usage/k8s/src/k8s_train/start_paddle.py index f1a770ccb54fbd7d4c3cf6bf134d00d7bf5961ca..935c12bb67e1fe08bc135a7a2220fcd43c548482 100755 --- a/doc/howto/usage/k8s/src/k8s_train/start_paddle.py +++ b/doc/howto/usage/k8s/src/k8s_train/start_paddle.py @@ -132,7 +132,8 @@ def startPaddle(idMap={}, train_args_dict=None): logDir = JOB_PATH_OUTPUT + "/node_" + str(trainerId) if not os.path.exists(JOB_PATH_OUTPUT): os.makedirs(JOB_PATH_OUTPUT) - os.mkdir(logDir) + if not os.path.exists(logDir): + os.mkdir(logDir) copyCommand = 'cp -rf ' + JOB_PATH + \ "/" + str(trainerId) + "/data/*" + " ./data/" os.system(copyCommand) diff --git a/paddle/scripts/travis/before_install.osx.sh b/paddle/scripts/travis/before_install.osx.sh deleted file mode 100755 index 80f031a74e7052d183b5ef21d432476ff1cce722..0000000000000000000000000000000000000000 --- a/paddle/scripts/travis/before_install.osx.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -brew update -brew tap homebrew/science -brew install openblas swig md5sha1sum diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh index 5e6350b57458594163f23cca41a546d7bd9b1eda..7deb3e62e88de7e1306fcbfc5a28aa4372d678e6 100755 --- a/paddle/scripts/travis/build_and_test.sh +++ b/paddle/scripts/travis/build_and_test.sh @@ -2,18 +2,11 @@ source ./common.sh NPROC=1 -if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then - export PYTHONPATH=/opt/python/2.7.12/lib/python2.7/site-packages - export PYTHONHOME=/opt/python/2.7.12 - export PATH=/opt/python/2.7.12/bin:${PATH} - cmake .. -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS} - NRPOC=`nproc` - make -j $NPROC - make coveralls - sudo make install -elif [[ "$TRAVIS_OS_NAME" == "osx" ]]; then - export PYTHONPATH=/usr/local/lib/python2.7/site-packages - cmake .. -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS} - NPROC=`sysctl -n hw.ncpu` - make -j $NPROC -fi +export PYTHONPATH=/opt/python/2.7.12/lib/python2.7/site-packages +export PYTHONHOME=/opt/python/2.7.12 +export PATH=/opt/python/2.7.12/bin:${PATH} +cmake .. -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS} +NRPOC=`nproc` +make -j $NPROC +make coveralls +sudo make install diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py index a0e194c650bebf70b956cfff6be5c75fe8d9f83b..2f7dcf4cfff5897d7182289e295ffdb9d63c636f 100644 --- a/python/paddle/v2/dataset/__init__.py +++ b/python/paddle/v2/dataset/__init__.py @@ -17,6 +17,9 @@ import imikolov import imdb import cifar import movielens +import conll05 import uci_housing -__all__ = ['mnist', 'imikolov', 'imdb', 'cifar', 'movielens', 'uci_housing'] +__all__ = [ + 'mnist', 'imikolov', 'imdb', 'cifar', 'movielens', 'conll05', 'uci_housing' +] diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py index 7874161a05968921a8e6789a7ff6a229d6cc6c01..e96a701c1a944e2d6d84f897157cb357c5aa0824 100644 --- a/python/paddle/v2/dataset/conll05.py +++ b/python/paddle/v2/dataset/conll05.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.v2.dataset.common import tarfile import gzip import itertools +from common import download __all__ = ['test, get_dict', 'get_embedding'] """ @@ -160,7 +160,6 @@ def reader_creator(corpus_reader, ctx_p2 = 'eos' word_idx = [word_dict.get(w, UNK_IDX) for w in sentence] - pred_idx = [predicate_dict.get(predicate)] * sen_len ctx_n2_idx = [word_dict.get(ctx_n2, UNK_IDX)] * sen_len ctx_n1_idx = [word_dict.get(ctx_n1, UNK_IDX)] * sen_len @@ -168,38 +167,30 @@ def reader_creator(corpus_reader, ctx_p1_idx = [word_dict.get(ctx_p1, UNK_IDX)] * sen_len ctx_p2_idx = [word_dict.get(ctx_p2, UNK_IDX)] * sen_len + pred_idx = [predicate_dict.get(predicate)] * sen_len label_idx = [label_dict.get(w) for w in labels] - yield word_idx, pred_idx, ctx_n2_idx, ctx_n1_idx, \ - ctx_0_idx, ctx_p1_idx, ctx_p2_idx, mark, label_idx + yield word_idx, ctx_n2_idx, ctx_n1_idx, \ + ctx_0_idx, ctx_p1_idx, ctx_p2_idx, pred_idx, mark, label_idx - return reader() + return reader def get_dict(): - word_dict = load_dict( - common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)) - verb_dict = load_dict( - common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)) - label_dict = load_dict( - common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)) + word_dict = load_dict(download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)) + verb_dict = load_dict(download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)) + label_dict = load_dict(download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)) return word_dict, verb_dict, label_dict def get_embedding(): - return common.download(EMB_URL, 'conll05st', EMB_MD5) + return download(EMB_URL, 'conll05st', EMB_MD5) def test(): word_dict, verb_dict, label_dict = get_dict() reader = corpus_reader( - common.download(DATA_URL, 'conll05st', DATA_MD5), + download(DATA_URL, 'conll05st', DATA_MD5), words_name='conll05st-release/test.wsj/words/test.wsj.words.gz', props_name='conll05st-release/test.wsj/props/test.wsj.props.gz') return reader_creator(reader, word_dict, verb_dict, label_dict) - - -if __name__ == '__main__': - print get_embedding() - for f in test(): - print f diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py index ffd7d89049358e979db762af07701e010f40dc6e..f27756a38a9cd809fdaaf92e7f8a72b681915fc8 100644 --- a/python/paddle/v2/dataset/imdb.py +++ b/python/paddle/v2/dataset/imdb.py @@ -116,3 +116,8 @@ def test(word_idx): return reader_creator( re.compile("aclImdb/test/pos/.*\.txt$"), re.compile("aclImdb/test/neg/.*\.txt$"), word_idx, 1000) + + +def word_dict(): + return build_dict( + re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150) diff --git a/python/paddle/v2/event.py b/python/paddle/v2/event.py index a78bcf076cc65e0dfdfc5760e099900418162f35..a429e36b63c9e812332673b66f4d8b99f3303cf8 100644 --- a/python/paddle/v2/event.py +++ b/python/paddle/v2/event.py @@ -34,8 +34,9 @@ class WithMetric(object): class TestResult(WithMetric): - def __init__(self, evaluator): + def __init__(self, evaluator, cost): super(TestResult, self).__init__(evaluator) + self.cost = cost class BeginPass(object): diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py index e743a49523ff21627ea2abfb76cee8b9ffd685e2..b4a713f7d53c2c5aef7e356906f88475037fa8d2 100644 --- a/python/paddle/v2/trainer.py +++ b/python/paddle/v2/trainer.py @@ -108,9 +108,6 @@ class SGD(ITrainer): pass_evaluator.start() updater.startPass() for batch_id, data_batch in enumerate(reader()): - pass_type = updater.startBatch(len(data_batch)) - self.__gradient_machine__.forwardBackward( - feeder(data_batch), out_args, pass_type) batch_evaluator.start() event_handler( v2_event.BeginIteration( @@ -123,10 +120,8 @@ class SGD(ITrainer): for each_param in self.__gradient_machine__.getNonStaticParameters( ): updater.update(each_param) - # Get cost. We use numpy to calculate total cost for this batch. - cost_vec = out_args.getSlotValue(0) - cost_vec = cost_vec.copyToNumpyMat() - cost = cost_vec.sum() / len(data_batch) + cost_sum = out_args.sumCosts() + cost = cost_sum / len(data_batch) updater.finishBatch(cost) batch_evaluator.finish() event_handler( @@ -155,13 +150,18 @@ class SGD(ITrainer): evaluator = self.__gradient_machine__.makeEvaluator() out_args = api.Arguments.createArguments(0) evaluator.start() + total_cost = 0 + num_samples = 0.0 for data_batch in reader(): + num_samples += len(data_batch) self.__gradient_machine__.forward( feeder(data_batch), out_args, api.PASS_TEST) + total_cost += out_args.sumCosts() self.__gradient_machine__.eval(evaluator) evaluator.finish() - return v2_event.TestResult(evaluator=evaluator) + return v2_event.TestResult( + evaluator=evaluator, cost=total_cost / num_samples) def __check_train_args__(reader, event_handler, **kwargs):