diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 26da7e8e384bafdcbcd1a358c39cc6eb167b067e..1575d8e9f5613e972df672b1daae145595676e8b 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -14,7 +14,7 @@
 
 INCLUDE(ExternalProject)
 
-FIND_PACKAGE(Protobuf)
+FIND_PACKAGE(Protobuf 3.1)
 
 IF(NOT PROTOBUF_FOUND)
     SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/protobuf)
diff --git a/demo/mnist/api_train_v2.py b/demo/mnist/api_train_v2.py
index 3aa2199bcb7126325e573f3c84442b52f4a3f21c..6b95a88042a13a280bcb80f753b3887fcef37296 100644
--- a/demo/mnist/api_train_v2.py
+++ b/demo/mnist/api_train_v2.py
@@ -92,12 +92,8 @@ def main():
     def event_handler(event):
         if isinstance(event, paddle.event.EndIteration):
             if event.batch_id % 1000 == 0:
-                result = trainer.test(reader=paddle.batch(
-                    paddle.dataset.mnist.test(), batch_size=256))
-
-                print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
-                    event.pass_id, event.batch_id, event.cost, event.metrics,
-                    result.metrics)
+                print "Pass %d, Batch %d, Cost %f, %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics)
 
                 with gzip.open('params.tar.gz', 'w') as f:
                     parameters.to_tar(f)
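Note on the mnist change above: the per-1000-batch trainer.test() call is removed from the hot loop, so only the training cost and metrics are logged there. If periodic evaluation is still wanted, a minimal sketch (not part of this patch, and assuming paddle.event.EndPass is available as elsewhere in the v2 API) would move the test pass into an end-of-pass handler inside main() of demo/mnist/api_train_v2.py, where trainer and paddle are already in scope:

    # Sketch only: evaluate once per pass instead of on every 1000th batch.
    # Meant to replace event_handler inside main(); `trainer` comes from the demo.
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 1000 == 0:
                print "Pass %d, Batch %d, Cost %f, %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics)
        if isinstance(event, paddle.event.EndPass):
            result = trainer.test(reader=paddle.batch(
                paddle.dataset.mnist.test(), batch_size=256))
            print "Pass %d, Testing metrics %s" % (event.pass_id, result.metrics)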
diff --git a/demo/recommendation/api_train_v2.py b/demo/recommendation/api_train_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b254933a1de60bf8d74517f0d52401d334703b7
--- /dev/null
+++ b/demo/recommendation/api_train_v2.py
@@ -0,0 +1,125 @@
+import paddle.v2 as paddle
+import cPickle
+import copy
+
+
+def main():
+    paddle.init(use_gpu=False)
+    movie_title_dict = paddle.dataset.movielens.get_movie_title_dict()
+    uid = paddle.layer.data(
+        name='user_id',
+        type=paddle.data_type.integer_value(
+            paddle.dataset.movielens.max_user_id() + 1))
+    usr_emb = paddle.layer.embedding(input=uid, size=32)
+
+    usr_gender_id = paddle.layer.data(
+        name='gender_id', type=paddle.data_type.integer_value(2))
+    usr_gender_emb = paddle.layer.embedding(input=usr_gender_id, size=16)
+
+    usr_age_id = paddle.layer.data(
+        name='age_id',
+        type=paddle.data_type.integer_value(
+            len(paddle.dataset.movielens.age_table)))
+    usr_age_emb = paddle.layer.embedding(input=usr_age_id, size=16)
+
+    usr_job_id = paddle.layer.data(
+        name='job_id',
+        type=paddle.data_type.integer_value(paddle.dataset.movielens.max_job_id(
+        ) + 1))
+
+    usr_job_emb = paddle.layer.embedding(input=usr_job_id, size=16)
+
+    usr_combined_features = paddle.layer.fc(
+        input=[usr_emb, usr_gender_emb, usr_age_emb, usr_job_emb],
+        size=200,
+        act=paddle.activation.Tanh())
+
+    mov_id = paddle.layer.data(
+        name='movie_id',
+        type=paddle.data_type.integer_value(
+            paddle.dataset.movielens.max_movie_id() + 1))
+    mov_emb = paddle.layer.embedding(input=mov_id, size=32)
+
+    mov_categories = paddle.layer.data(
+        name='category_id',
+        type=paddle.data_type.sparse_binary_vector(
+            len(paddle.dataset.movielens.movie_categories())))
+
+    mov_categories_hidden = paddle.layer.fc(input=mov_categories, size=32)
+
+    mov_title_id = paddle.layer.data(
+        name='movie_title',
+        type=paddle.data_type.integer_value_sequence(len(movie_title_dict)))
+    mov_title_emb = paddle.layer.embedding(input=mov_title_id, size=32)
+    mov_title_conv = paddle.networks.sequence_conv_pool(
+        input=mov_title_emb, hidden_size=32, context_len=3)
+
+    mov_combined_features = paddle.layer.fc(
+        input=[mov_emb, mov_categories_hidden, mov_title_conv],
+        size=200,
+        act=paddle.activation.Tanh())
+
+    inference = paddle.layer.cos_sim(
+        a=usr_combined_features, b=mov_combined_features, size=1, scale=5)
+    cost = paddle.layer.regression_cost(
+        input=inference,
+        label=paddle.layer.data(
+            name='score', type=paddle.data_type.dense_vector(1)))
+
+    parameters = paddle.parameters.create(cost)
+
+    trainer = paddle.trainer.SGD(cost=cost,
+                                 parameters=parameters,
+                                 update_equation=paddle.optimizer.Adam(
+                                     learning_rate=1e-4))
+    feeding = {
+        'user_id': 0,
+        'gender_id': 1,
+        'age_id': 2,
+        'job_id': 3,
+        'movie_id': 4,
+        'category_id': 5,
+        'movie_title': 6,
+        'score': 7
+    }
+
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 100 == 0:
+                print "Pass %d Batch %d Cost %.2f" % (
+                    event.pass_id, event.batch_id, event.cost)
+
+    trainer.train(
+        reader=paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.movielens.train(), buf_size=8192),
+            batch_size=256),
+        event_handler=event_handler,
+        feeding=feeding,
+        num_passes=1)
+
+    user_id = 234
+    movie_id = 345
+
+    user = paddle.dataset.movielens.user_info()[user_id]
+    movie = paddle.dataset.movielens.movie_info()[movie_id]
+
+    feature = user.value() + movie.value()
+
+    def reader():
+        yield feature
+
+    infer_dict = copy.copy(feeding)
+    del infer_dict['score']
+
+    prediction = paddle.infer(
+        output=inference,
+        parameters=parameters,
+        reader=paddle.batch(
+            reader, batch_size=32),
+        feeding=infer_dict)
+    print(prediction + 5) / 2
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/v2/data_feeder.py b/python/paddle/v2/data_feeder.py
index ba77fecf21eecf9115cc1b20720383b790294eb0..bda8e22fd282f8ff4a820e4ecb6b3bb421d57890 100644
--- a/python/paddle/v2/data_feeder.py
+++ b/python/paddle/v2/data_feeder.py
@@ -85,6 +85,9 @@ class DataFeeder(DataProviderConverter):
             input_types.append(each[1])
         DataProviderConverter.__init__(self, input_types)
 
+    def __len__(self):
+        return len(self.input_names)
+
     def convert(self, dat, argument=None):
         """
         :param dat: A list of mini-batch data. Each sample is a list or tuple
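The new DataFeeder.__len__ above lets callers ask how many input slots a feeder was built with. A small usage sketch (the data types and feeding map here are illustrative, not part of the patch; the constructor call mirrors the DataFeeder(self.__data_types__, feeding) use in inference.py below):

    # Sketch: len(feeder) reports the number of declared input slots.
    import paddle.v2 as paddle
    from paddle.v2.data_feeder import DataFeeder

    paddle.init(use_gpu=False)
    data_types = [('user_id', paddle.data_type.integer_value(100)),
                  ('score', paddle.data_type.dense_vector(1))]
    feeding = {'user_id': 0, 'score': 1}
    feeder = DataFeeder(data_types, feeding)
    assert len(feeder) == 2  # one entry per declared input, via input_names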
diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py
index dc65e8f8b6f04b078a3449c622478095086cecbe..25fd8227da2f219d75c6b830e65627ecf35be453 100644
--- a/python/paddle/v2/dataset/movielens.py
+++ b/python/paddle/v2/dataset/movielens.py
@@ -23,7 +23,12 @@ import re
 import random
 import functools
 
-__all__ = ['train_creator', 'test_creator']
+__all__ = [
+    'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id',
+    'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info'
+]
+
+age_table = [1, 18, 25, 35, 45, 50, 56]
 
 
 class MovieInfo(object):
@@ -38,17 +43,32 @@ class MovieInfo(object):
             [MOVIE_TITLE_DICT[w.lower()] for w in self.title.split()]
         ]
 
+    def __str__(self):
+        return "<MovieInfo id(%d), title(%s), categories(%s)>" % (
+            self.index, self.title, self.categories)
+
+    def __repr__(self):
+        return self.__str__()
+
 
 class UserInfo(object):
     def __init__(self, index, gender, age, job_id):
         self.index = int(index)
         self.is_male = gender == 'M'
-        self.age = [1, 18, 25, 35, 45, 50, 56].index(int(age))
+        self.age = age_table.index(int(age))
         self.job_id = int(job_id)
 
     def value(self):
         return [self.index, 0 if self.is_male else 1, self.age, self.job_id]
 
+    def __str__(self):
+        return "<UserInfo id(%d), gender(%s), age(%d), job(%d)>" % (
+            self.index, "M"
+            if self.is_male else "F", age_table[self.age], self.job_id)
+
+    def __repr__(self):
+        return str(self)
+
 
 MOVIE_INFO = None
 MOVIE_TITLE_DICT = None
@@ -59,7 +79,8 @@ USER_INFO = None
 def __initialize_meta_info__():
     fn = download(
         url='http://files.grouplens.org/datasets/movielens/ml-1m.zip',
-        md5='c4d9eecfca2ab87c1945afe126590906')
+        module_name='movielens',
+        md5sum='c4d9eecfca2ab87c1945afe126590906')
     global MOVIE_INFO
     if MOVIE_INFO is None:
         pattern = re.compile(r'^(.*)\((\d+)\)$')
@@ -122,14 +143,63 @@ def __reader_creator__(**kwargs):
     return lambda: __reader__(**kwargs)
 
 
-train_creator = functools.partial(__reader_creator__, is_test=False)
-test_creator = functools.partial(__reader_creator__, is_test=True)
+train = functools.partial(__reader_creator__, is_test=False)
+test = functools.partial(__reader_creator__, is_test=True)
+
+
+def get_movie_title_dict():
+    __initialize_meta_info__()
+    return MOVIE_TITLE_DICT
+
+
+def __max_index_info__(a, b):
+    if a.index > b.index:
+        return a
+    else:
+        return b
+
+
+def max_movie_id():
+    __initialize_meta_info__()
+    return reduce(__max_index_info__, MOVIE_INFO.viewvalues()).index
+
+
+def max_user_id():
+    __initialize_meta_info__()
+    return reduce(__max_index_info__, USER_INFO.viewvalues()).index
+
+
+def __max_job_id_impl__(a, b):
+    if a.job_id > b.job_id:
+        return a
+    else:
+        return b
+
+
+def max_job_id():
+    __initialize_meta_info__()
+    return reduce(__max_job_id_impl__, USER_INFO.viewvalues()).job_id
+
+
+def movie_categories():
+    __initialize_meta_info__()
+    return CATEGORIES_DICT
+
+
+def user_info():
+    __initialize_meta_info__()
+    return USER_INFO
+
+
+def movie_info():
+    __initialize_meta_info__()
+    return MOVIE_INFO
 
 
 def unittest():
-    for train_count, _ in enumerate(train_creator()()):
+    for train_count, _ in enumerate(train()()):
         pass
-    for test_count, _ in enumerate(test_creator()()):
+    for test_count, _ in enumerate(test()()):
         pass
     print train_count, test_count
 
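For reference, the renamed movielens readers and the new metadata helpers above can be exercised as follows (a sketch only; the printed counts depend on the downloaded ml-1m archive):

    # Sketch: exercise the new movielens API added in this patch.
    import paddle.v2 as paddle

    movielens = paddle.dataset.movielens
    print movielens.max_user_id(), movielens.max_movie_id(), movielens.max_job_id()
    print len(movielens.movie_categories()), 'movie categories'
    print movielens.user_info()[1]     # formatted by the new UserInfo.__str__
    print movielens.movie_info()[1]    # formatted by the new MovieInfo.__str__

    # train()/test() return reader factories; calling the factory yields samples
    # in the same column order as the feeding dict in the recommendation demo.
    first_sample = next(movielens.train()())
    print len(first_sample), 'columns in a training sample'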
diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py
index 53510d80c9d92b42ebbe120cc6f4166b09198ae5..4065f7fe11258da58bd0e03888b2edaf82fc8543 100644
--- a/python/paddle/v2/inference.py
+++ b/python/paddle/v2/inference.py
@@ -23,7 +23,7 @@ class Inference(object):
 
     def iter_infer(self, input=None, batch_size=None, reader=None,
                    feeding=None):
-
+        feeder = DataFeeder(self.__data_types__, feeding)
         if reader is None:
             assert input is not None and isinstance(input, collections.Iterable)
             if not isinstance(input, collections.Iterable):
@@ -45,8 +45,6 @@ class Inference(object):
             if input is not None:
                 raise ValueError("User should set either input or reader, "
                                  "should not set them both.")
-
-        feeder = DataFeeder(self.__data_types__, feeding)
         self.__gradient_machine__.start()
         for data_batch in reader():
             yield self.__gradient_machine__.forwardTest(feeder(data_batch))
@@ -70,7 +68,7 @@ class Inference(object):
         return retv
 
 
-def infer(output_layer, parameters, input=None, feeding=None, field='value'):
+def infer(output_layer, parameters, input, feeding=None, field='value'):
     """
     Infer a neural network by given neural network output and parameters.
     The user should pass either a batch of input data or reader method.
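With the signature change above, paddle.infer now requires input (a batch of samples), while the reader-based path stays available through Inference.iter_infer; note the recommendation demo in this patch still uses the reader= form. A minimal call shaped after that demo, following the keyword names in the new infer() signature and assuming the `inference` layer and trained `parameters` from the demo are in scope (sketch, not part of the patch):

    # Sketch: score one (user, movie) pair by passing an in-memory batch.
    import paddle.v2 as paddle

    user = paddle.dataset.movielens.user_info()[234]
    movie = paddle.dataset.movielens.movie_info()[345]
    sample = [user.value() + movie.value()]   # one row, columns as in `feeding`

    feeding = {
        'user_id': 0, 'gender_id': 1, 'age_id': 2, 'job_id': 3,
        'movie_id': 4, 'category_id': 5, 'movie_title': 6
    }
    prediction = paddle.infer(
        output_layer=inference,
        parameters=parameters,
        input=sample,
        feeding=feeding)
    print (prediction + 5) / 2   # map the scaled cos_sim output back to a 0-5 score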