diff --git a/demo/gan/data/download_cifar.sh b/demo/gan/data/download_cifar.sh index ae24ef2b7f2012fb719037d4868bdf0e7f9ce71d..bbadc7c10c73e45a0948018b8812f79040d14bc4 100755 --- a/demo/gan/data/download_cifar.sh +++ b/demo/gan/data/download_cifar.sh @@ -1,3 +1,4 @@ +#!/bin/bash # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/demo/gan/data/get_mnist_data.sh b/demo/gan/data/get_mnist_data.sh old mode 100644 new mode 100755 diff --git a/demo/image_classification/data/download_cifar.sh b/demo/image_classification/data/download_cifar.sh index 52e82d0d9812c88e5c85cffc0585e3425b862809..532178d627fe19ab8ea79ecae73e5328b5294bea 100755 --- a/demo/image_classification/data/download_cifar.sh +++ b/demo/image_classification/data/download_cifar.sh @@ -1,3 +1,4 @@ +#!/bin/bash # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/demo/image_classification/image_provider.py b/demo/image_classification/image_provider.py index 87eed5eebd7680e578c822083efb8b9eab16b266..6a315ff094c1af5f8250d8a22ff96740dddd9808 100644 --- a/demo/image_classification/image_provider.py +++ b/demo/image_classification/image_provider.py @@ -21,7 +21,7 @@ from paddle.trainer.PyDataProvider2 import * # # {'img_size': 32, -# 'settings': , +# 'settings': a global object, # 'color': True, # 'mean_img_size': 32, # 'meta': './data/cifar-out/batches/batches.meta', @@ -50,10 +50,10 @@ def hook(settings, img_size, mean_img_size, num_classes, color, meta, use_jpeg, settings.logger.info('Image size: %s', settings.img_size) settings.logger.info('Meta path: %s', settings.meta_path) - settings.input_types = [ - dense_vector(settings.img_raw_size), # image feature - integer_value(settings.num_classes) - ] # labels + settings.input_types = { + 'image': dense_vector(settings.img_raw_size), + 'label': integer_value(settings.num_classes) + } settings.logger.info('DataProvider Initialization finished') @@ -83,4 +83,7 @@ def processData(settings, file_list): img, settings.img_mean, settings.img_size, settings.is_train, settings.color) label = data['labels'][i] - yield img_feat.astype('float32'), int(label) + yield { + 'image': img_feat.astype('float32'), + 'label': int(label) + } diff --git a/demo/introduction/.gitignore b/demo/introduction/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..c54f3f9480ce4ceefda98f77a812ec2d6cd4a5e3 --- /dev/null +++ b/demo/introduction/.gitignore @@ -0,0 +1,5 @@ +dataprovider.pyc +empty.list +train.log +output +train.list diff --git a/demo/introduction/dataprovider.py b/demo/introduction/dataprovider.py index 03c920cc34b397643e97ad41cf06458245c7ca7b..5b48aad0408800676ae7da16eba2dcbb2124f25f 100644 --- a/demo/introduction/dataprovider.py +++ b/demo/introduction/dataprovider.py @@ -17,8 +17,10 @@ import random # define data types of input: 2 real numbers -@provider(input_types=[dense_vector(1), dense_vector(1)], use_seq=False) +@provider( + input_types={'x': dense_vector(1), + 'y': dense_vector(1)}, use_seq=False) def process(settings, input_file): for i in xrange(2000): x = random.random() - yield [x], [2 * x + 0.3] + yield {'x': [x], 'y': [2 * x + 0.3]} diff --git a/demo/introduction/trainer_config.py b/demo/introduction/trainer_config.py index 41cebcf6e146e55efb89c2ceea429fa003ff206e..ecafe955f9e5c1062168d5d7b6b4c639d6e72a99 100644 --- a/demo/introduction/trainer_config.py +++ b/demo/introduction/trainer_config.py @@ -15,11 +15,8 @@ from paddle.trainer_config_helpers import * # 1. read data. Suppose you saved above python code as dataprovider.py -data_file = 'empty.list' -with open(data_file, 'w') as f: - f.writelines(' ') define_py_data_sources2( - train_list=data_file, + train_list=['no_matter.txt'], test_list=None, module='dataprovider', obj='process', diff --git a/demo/quick_start/.gitignore b/demo/quick_start/.gitignore index d6bc73105b1abfdae3067b7fecd656079a56b57c..f71662563ff96d6227dd568d9951a90b0d09456e 100644 --- a/demo/quick_start/.gitignore +++ b/demo/quick_start/.gitignore @@ -8,6 +8,8 @@ data/test.list data/test.txt data/train.list data/train.txt +data/pred.list +data/pred.txt dataprovider_copy_1.py train.log output diff --git a/demo/quick_start/dataprovider_bow.py b/demo/quick_start/dataprovider_bow.py index 8e651d77bf3fd3bbd990ef314456ec14bd77cfeb..2745495586449b5d1eb64ae570f73eb6b14dbdfe 100644 --- a/demo/quick_start/dataprovider_bow.py +++ b/demo/quick_start/dataprovider_bow.py @@ -31,16 +31,16 @@ def initializer(settings, dictionary, **kwargs): # setting.input_types specifies what the data types the data provider # generates. - settings.input_types = [ + settings.input_types = { # The first input is a sparse_binary_vector, # which means each dimension of the vector is either 0 or 1. It is the # bag-of-words (BOW) representation of the texts. - sparse_binary_vector(len(dictionary)), + 'word': sparse_binary_vector(len(dictionary)), # The second input is an integer. It represents the category id of the # sample. 2 means there are two labels in the dataset. # (1 for positive and 0 for negative) - integer_value(2) - ] + 'label': integer_value(2) + } # Delaring a data provider. It has an initializer 'data_initialzer'. @@ -67,12 +67,12 @@ def process(settings, file_name): # Return the features for the current comment. The first is a list # of ids representing a 0-1 binary sparse vector of the text, # the second is the integer id of the label. - yield word_vector, int(label) + yield {'word': word_vector, 'label': int(label)} def predict_initializer(settings, dictionary, **kwargs): settings.word_dict = dictionary - settings.input_types = [sparse_binary_vector(len(dictionary))] + settings.input_types = {'word': sparse_binary_vector(len(dictionary))} # Declaring a data provider for prediction. The difference with process @@ -83,4 +83,4 @@ def process_predict(settings, file_name): for line in f: comment = line.strip().split() word_vector = [settings.word_dict.get(w, UNK_IDX) for w in comment] - yield word_vector + yield {'word': word_vector} diff --git a/demo/quick_start/dataprovider_emb.py b/demo/quick_start/dataprovider_emb.py index b010253a8a764ede4ff0416231ac6aa2fd8f94e3..ddfa3ce9b73555cb3b7f5a44314ca35b12d41ede 100755 --- a/demo/quick_start/dataprovider_emb.py +++ b/demo/quick_start/dataprovider_emb.py @@ -19,13 +19,13 @@ UNK_IDX = 0 def initializer(settings, dictionary, **kwargs): settings.word_dict = dictionary - settings.input_types = [ + settings.input_types = { # Define the type of the first input as sequence of integer. # The value of the integers range from 0 to len(dictrionary)-1 - integer_value_sequence(len(dictionary)), + 'word': integer_value_sequence(len(dictionary)), # Define the second input for label id - integer_value(2) - ] + 'label': integer_value(2) + } @provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM) @@ -35,15 +35,12 @@ def process(settings, file_name): label, comment = line.strip().split('\t') words = comment.split() word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words] - yield word_slot, int(label) + yield {'word': word_slot, 'label': int(label)} def predict_initializer(settings, dictionary, **kwargs): settings.word_dict = dictionary - settings.input_types = [ - integer_value( - len(dictionary), seq_type=SequenceType.SEQUENCE) - ] + settings.input_types = {'word': integer_value_sequence(len(dictionary))} @provider(init_hook=predict_initializer, should_shuffle=False) @@ -52,4 +49,4 @@ def process_predict(settings, file_name): for line in f: comment = line.strip().split() word_slot = [settings.word_dict.get(w, UNK_IDX) for w in comment] - yield word_slot + yield {'word': word_slot} diff --git a/demo/recommendation/common_utils.py b/demo/recommendation/common_utils.py index d4fbdad1d7ac53b35d9478c65ab61c2d28845261..c20c65286621d701ad58409b539bbe9c813d453a 100755 --- a/demo/recommendation/common_utils.py +++ b/demo/recommendation/common_utils.py @@ -17,13 +17,14 @@ from paddle.trainer.PyDataProvider2 import * def meta_to_header(meta, name): metas = meta[name]['__meta__']['raw_meta'] for each_meta in metas: + slot_name = each_meta.get('name', '%s_id' % name) if each_meta['type'] == 'id': - yield integer_value(each_meta['max']) + yield slot_name, integer_value(each_meta['max']) elif each_meta['type'] == 'embedding': is_seq = each_meta['seq'] == 'sequence' - yield integer_value( + yield slot_name, integer_value( len(each_meta['dict']), seq_type=SequenceType.SEQUENCE if is_seq else SequenceType.NO_SEQUENCE) elif each_meta['type'] == 'one_hot_dense': - yield dense_vector(len(each_meta['dict'])) + yield slot_name, dense_vector(len(each_meta['dict'])) diff --git a/demo/recommendation/dataprovider.py b/demo/recommendation/dataprovider.py index 80c62d75612e544c5197f878a83284f8e08d1a99..c4ff96d80e81926049c9a71d6d9d991c0b568c25 100755 --- a/demo/recommendation/dataprovider.py +++ b/demo/recommendation/dataprovider.py @@ -16,6 +16,14 @@ from paddle.trainer.PyDataProvider2 import * import common_utils # parse +def __list_to_map__(lst): + ret_val = dict() + for each in lst: + k, v = each + ret_val[k] = v + return ret_val + + def hook(settings, meta, **kwargs): """ Init hook is invoked before process data. It will set obj.slots and store @@ -34,12 +42,16 @@ def hook(settings, meta, **kwargs): # second part is user features. # final part is rating score. # header is a list of [USE_SEQ_OR_NOT?, SlotType] - headers = list(common_utils.meta_to_header(meta, 'movie')) - headers.extend(list(common_utils.meta_to_header(meta, 'user'))) - headers.append(dense_vector(1)) # Score + movie_headers = list(common_utils.meta_to_header(meta, 'movie')) + settings.movie_names = [h[0] for h in movie_headers] + headers = movie_headers + user_headers = list(common_utils.meta_to_header(meta, 'user')) + settings.user_names = [h[0] for h in user_headers] + headers.extend(user_headers) + headers.append(("rating", dense_vector(1))) # Score # slot types. - settings.input_types = headers + settings.input_types = __list_to_map__(headers) settings.meta = meta @@ -57,20 +69,20 @@ def process(settings, filename): movie_meta = settings.meta['movie'][movie_id] user_meta = settings.meta['user'][user_id] - outputs = [movie_id - 1] + outputs = [('movie_id', movie_id - 1)] # Then add movie features - for each_meta in movie_meta: - outputs.append(each_meta) + for i, each_meta in enumerate(movie_meta): + outputs.append((settings.movie_names[i + 1], each_meta)) # Then add user id. - outputs.append(user_id - 1) + outputs.append(('user_id', user_id - 1)) # Then add user features. - for each_meta in user_meta: - outputs.append(each_meta) + for i, each_meta in enumerate(user_meta): + outputs.append((settings.user_names[i + 1], each_meta)) # Finally, add score - outputs.append([score]) + outputs.append(('rating', [score])) # Return data to paddle - yield outputs + yield __list_to_map__(outputs) diff --git a/demo/recommendation/prediction.py b/demo/recommendation/prediction.py index 191120188ef5dbddf4c42a1356a9fa46e16c5ca1..8ad993eab3a9f637cfff752bfedbbc62eaf3c8d5 100755 --- a/demo/recommendation/prediction.py +++ b/demo/recommendation/prediction.py @@ -34,8 +34,8 @@ if __name__ == '__main__': network.loadParameters(model_path) with open('./data/meta.bin', 'rb') as f: meta = pickle.load(f) - headers = list(meta_to_header(meta, 'movie')) - headers.extend(list(meta_to_header(meta, 'user'))) + headers = [h[1] for h in meta_to_header(meta, 'movie')] + headers.extend([h[1] for h in meta_to_header(meta, 'user')]) cvt = DataProviderConverter(headers) while True: movie_id = int(raw_input("Input movie_id: ")) diff --git a/demo/recommendation/preprocess.sh b/demo/recommendation/preprocess.sh index e121e470193fa1e73c000fe612d6858e28f9261f..dc6b2cdfc13273026bb9b2b36677ef8cac454d3a 100755 --- a/demo/recommendation/preprocess.sh +++ b/demo/recommendation/preprocess.sh @@ -25,7 +25,7 @@ python meta_generator.py $dir meta.bin --config=meta_config.json echo 'split train/test file' python split.py $dir/ratings.dat --delimiter=${delimiter} --test_ratio=0.1 echo 'shuffle train file' -shuf $dir/ratings.dat.train > ratings.dat.train +gshuf $dir/ratings.dat.train > ratings.dat.train cp $dir/ratings.dat.test . echo "./data/ratings.dat.train" > train.list echo "./data/ratings.dat.test" > test.list diff --git a/demo/semantic_role_labeling/.gitignore b/demo/semantic_role_labeling/.gitignore index cd90ca7bbe9be46f54cb656a8067c794a55d8cfc..65c9b674c7d1dad53b7d1c6ee1dcbdb72553888d 100644 --- a/demo/semantic_role_labeling/.gitignore +++ b/demo/semantic_role_labeling/.gitignore @@ -8,3 +8,7 @@ data/test.wsj.seq_pair data/test.wsj.words data/tgt.dict output +data/emb +data/targetDict.txt +data/verbDict.txt +data/wordDict.txt diff --git a/demo/semantic_role_labeling/data/get_data.sh b/demo/semantic_role_labeling/data/get_data.sh old mode 100644 new mode 100755 diff --git a/python/paddle/trainer_config_helpers/data_sources.py b/python/paddle/trainer_config_helpers/data_sources.py index 0fcf993d57bb3e08c8711a20b16ad94d9c91db1e..d7cb95c477133823f9147e2085c9c609916f16e8 100644 --- a/python/paddle/trainer_config_helpers/data_sources.py +++ b/python/paddle/trainer_config_helpers/data_sources.py @@ -69,7 +69,7 @@ def define_py_data_source(file_list, """ if isinstance(file_list, list): file_list_name = 'train.list' - if isinstance(cls, TestData): + if cls == TestData: file_list_name = 'test.list' with open(file_list_name, 'w') as f: f.writelines(file_list) diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 8dd6b7b7d28f841d7d7657b8ef3c25188c2f086e..c10fa671bdbf7c0a5e34c183533e04bea76029d4 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -970,7 +970,7 @@ def pooling_layer(input, :param layer_attr: The Extra Attributes for layer, such as dropout. :type layer_attr: ExtraLayerAttribute|None :return: LayerOutput object. - :rtype: LayerType + :rtype: LayerOutput """ extra_dict = dict() # noinspection PyUnresolvedReferences