diff --git a/demo/mnist/api_train_v2.py b/demo/mnist/api_train_v2.py index 68761be80f24f074c041109d6769e84fa7204367..cc893ef0f5748906225570a06da0d8e8bef63460 100644 --- a/demo/mnist/api_train_v2.py +++ b/demo/mnist/api_train_v2.py @@ -92,12 +92,8 @@ def main(): def event_handler(event): if isinstance(event, paddle.event.EndIteration): if event.batch_id % 1000 == 0: - result = trainer.test(reader=paddle.batch( - paddle.dataset.mnist.test(), batch_size=256)) - - print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics, - result.metrics) + print "Pass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) with gzip.open('params.tar.gz', 'w') as f: parameters.to_tar(f) @@ -123,17 +119,16 @@ def main(): print 'Best pass is %s, testing Avgcost is %s' % (best[0], best[1]) print 'The classification accuracy is %.2f%%' % (100 - float(best[2]) * 100) + test_creator = paddle.dataset.mnist.test() + test_data = [] + for item in test_creator(): + test_data.append(item[0]) + if len(test_data) == 100: + break + # output is a softmax layer. It returns probabilities. 
# Shape should be (100, 10) - probs = paddle.infer( - output=predict, - parameters=parameters, - reader=paddle.batch( - paddle.reader.firstn( - paddle.reader.map_readers(lambda item: (item[0], ), - paddle.dataset.mnist.test()), - n=100), - batch_size=32)) + probs = paddle.infer(output=predict, parameters=parameters, input=test_data) print probs.shape diff --git a/demo/recommendation/api_train_v2.py b/demo/recommendation/api_train_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..9b254933a1de60bf8d74517f0d52401d334703b7 --- /dev/null +++ b/demo/recommendation/api_train_v2.py @@ -0,0 +1,125 @@ +import paddle.v2 as paddle +import cPickle +import copy + + +def main(): + paddle.init(use_gpu=False) + movie_title_dict = paddle.dataset.movielens.get_movie_title_dict() + uid = paddle.layer.data( + name='user_id', + type=paddle.data_type.integer_value( + paddle.dataset.movielens.max_user_id() + 1)) + usr_emb = paddle.layer.embedding(input=uid, size=32) + + usr_gender_id = paddle.layer.data( + name='gender_id', type=paddle.data_type.integer_value(2)) + usr_gender_emb = paddle.layer.embedding(input=usr_gender_id, size=16) + + usr_age_id = paddle.layer.data( + name='age_id', + type=paddle.data_type.integer_value( + len(paddle.dataset.movielens.age_table))) + usr_age_emb = paddle.layer.embedding(input=usr_age_id, size=16) + + usr_job_id = paddle.layer.data( + name='job_id', + type=paddle.data_type.integer_value(paddle.dataset.movielens.max_job_id( + ) + 1)) + + usr_job_emb = paddle.layer.embedding(input=usr_job_id, size=16) + + usr_combined_features = paddle.layer.fc( + input=[usr_emb, usr_gender_emb, usr_age_emb, usr_job_emb], + size=200, + act=paddle.activation.Tanh()) + + mov_id = paddle.layer.data( + name='movie_id', + type=paddle.data_type.integer_value( + paddle.dataset.movielens.max_movie_id() + 1)) + mov_emb = paddle.layer.embedding(input=mov_id, size=32) + + mov_categories = paddle.layer.data( + name='category_id', + 
type=paddle.data_type.sparse_binary_vector( + len(paddle.dataset.movielens.movie_categories()))) + + mov_categories_hidden = paddle.layer.fc(input=mov_categories, size=32) + + mov_title_id = paddle.layer.data( + name='movie_title', + type=paddle.data_type.integer_value_sequence(len(movie_title_dict))) + mov_title_emb = paddle.layer.embedding(input=mov_title_id, size=32) + mov_title_conv = paddle.networks.sequence_conv_pool( + input=mov_title_emb, hidden_size=32, context_len=3) + + mov_combined_features = paddle.layer.fc( + input=[mov_emb, mov_categories_hidden, mov_title_conv], + size=200, + act=paddle.activation.Tanh()) + + inference = paddle.layer.cos_sim( + a=usr_combined_features, b=mov_combined_features, size=1, scale=5) + cost = paddle.layer.regression_cost( + input=inference, + label=paddle.layer.data( + name='score', type=paddle.data_type.dense_vector(1))) + + parameters = paddle.parameters.create(cost) + + trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=paddle.optimizer.Adam( + learning_rate=1e-4)) + feeding = { + 'user_id': 0, + 'gender_id': 1, + 'age_id': 2, + 'job_id': 3, + 'movie_id': 4, + 'category_id': 5, + 'movie_title': 6, + 'score': 7 + } + + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "Pass %d Batch %d Cost %.2f" % ( + event.pass_id, event.batch_id, event.cost) + + trainer.train( + reader=paddle.batch( + paddle.reader.shuffle( + paddle.dataset.movielens.train(), buf_size=8192), + batch_size=256), + event_handler=event_handler, + feeding=feeding, + num_passes=1) + + user_id = 234 + movie_id = 345 + + user = paddle.dataset.movielens.user_info()[user_id] + movie = paddle.dataset.movielens.movie_info()[movie_id] + + feature = user.value() + movie.value() + + def reader(): + yield feature + + infer_dict = copy.copy(feeding) + del infer_dict['score'] + + prediction = paddle.infer( + output=inference, + parameters=parameters, + 
reader=paddle.batch( + reader, batch_size=32), + feeding=infer_dict) + print(prediction + 5) / 2 + + +if __name__ == '__main__': + main() diff --git a/doc/api/v2/run_logic.rst b/doc/api/v2/run_logic.rst index 904d45966dfc16a474016ff48fd5a951988b0ab0..0f807873ff9a16263920fa73bf70316df3336d0b 100644 --- a/doc/api/v2/run_logic.rst +++ b/doc/api/v2/run_logic.rst @@ -2,6 +2,7 @@ Trainer API ########### + ========== Parameters ========== @@ -24,3 +25,10 @@ Event .. automodule:: paddle.v2.event :members: + + +========= +Inference +========= + +.. autofunction:: paddle.v2.infer \ No newline at end of file diff --git a/python/paddle/v2/data_feeder.py b/python/paddle/v2/data_feeder.py index ba77fecf21eecf9115cc1b20720383b790294eb0..bda8e22fd282f8ff4a820e4ecb6b3bb421d57890 100644 --- a/python/paddle/v2/data_feeder.py +++ b/python/paddle/v2/data_feeder.py @@ -85,6 +85,9 @@ class DataFeeder(DataProviderConverter): input_types.append(each[1]) DataProviderConverter.__init__(self, input_types) + def __len__(self): + return len(self.input_names) + def convert(self, dat, argument=None): """ :param dat: A list of mini-batch data. 
Each sample is a list or tuple diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py index dc65e8f8b6f04b078a3449c622478095086cecbe..25fd8227da2f219d75c6b830e65627ecf35be453 100644 --- a/python/paddle/v2/dataset/movielens.py +++ b/python/paddle/v2/dataset/movielens.py @@ -23,7 +23,12 @@ import re import random import functools -__all__ = ['train_creator', 'test_creator'] +__all__ = [ + 'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id', + 'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info' +] + +age_table = [1, 18, 25, 35, 45, 50, 56] class MovieInfo(object): @@ -38,17 +43,32 @@ class MovieInfo(object): [MOVIE_TITLE_DICT[w.lower()] for w in self.title.split()] ] + def __str__(self): + return "<MovieInfo id(%d), title(%s), categories(%s)>" % ( + self.index, self.title, self.categories) + + def __repr__(self): + return self.__str__() + class UserInfo(object): def __init__(self, index, gender, age, job_id): self.index = int(index) self.is_male = gender == 'M' - self.age = [1, 18, 25, 35, 45, 50, 56].index(int(age)) + self.age = age_table.index(int(age)) self.job_id = int(job_id) def value(self): return [self.index, 0 if self.is_male else 1, self.age, self.job_id] + def __str__(self): + return "<UserInfo id(%d), gender(%s), age(%d), job(%d)>" % ( + self.index, "M" + if self.is_male else "F", age_table[self.age], self.job_id) + + def __repr__(self): + return str(self) + MOVIE_INFO = None MOVIE_TITLE_DICT = None @@ -59,7 +79,8 @@ USER_INFO = None def __initialize_meta_info__(): fn = download( url='http://files.grouplens.org/datasets/movielens/ml-1m.zip', - md5='c4d9eecfca2ab87c1945afe126590906') + module_name='movielens', + md5sum='c4d9eecfca2ab87c1945afe126590906') global MOVIE_INFO if MOVIE_INFO is None: pattern = re.compile(r'^(.*)\((\d+)\)$') @@ -122,14 +143,63 @@ def __reader_creator__(**kwargs): return lambda: __reader__(**kwargs) -train_creator = functools.partial(__reader_creator__, is_test=False) -test_creator = functools.partial(__reader_creator__, is_test=True) +train = 
functools.partial(__reader_creator__, is_test=False) +test = functools.partial(__reader_creator__, is_test=True) + + +def get_movie_title_dict(): + __initialize_meta_info__() + return MOVIE_TITLE_DICT + + +def __max_index_info__(a, b): + if a.index > b.index: + return a + else: + return b + + +def max_movie_id(): + __initialize_meta_info__() + return reduce(__max_index_info__, MOVIE_INFO.viewvalues()).index + + +def max_user_id(): + __initialize_meta_info__() + return reduce(__max_index_info__, USER_INFO.viewvalues()).index + + +def __max_job_id_impl__(a, b): + if a.job_id > b.job_id: + return a + else: + return b + + +def max_job_id(): + __initialize_meta_info__() + return reduce(__max_job_id_impl__, USER_INFO.viewvalues()).job_id + + +def movie_categories(): + __initialize_meta_info__() + return CATEGORIES_DICT + + +def user_info(): + __initialize_meta_info__() + return USER_INFO + + +def movie_info(): + __initialize_meta_info__() + return MOVIE_INFO def unittest(): - for train_count, _ in enumerate(train_creator()()): + for train_count, _ in enumerate(train()()): pass - for test_count, _ in enumerate(test_creator()()): + for test_count, _ in enumerate(test()()): pass print train_count, test_count diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py index 7d889bce7fe5ded22755a527575595f375691df4..35949622abb7a704b0b23d4f9457738a1177a795 100644 --- a/python/paddle/v2/inference.py +++ b/python/paddle/v2/inference.py @@ -1,9 +1,9 @@ +import numpy import py_paddle.swig_paddle as api - +import collections import topology +import minibatch from data_feeder import DataFeeder -import itertools -import numpy __all__ = ['infer'] @@ -21,8 +21,33 @@ class Inference(object): self.__gradient_machine__ = gm self.__data_types__ = topo.data_type() - def iter_infer(self, reader, feeding=None): + def iter_infer(self, input=None, batch_size=None, reader=None, + feeding=None): feeder = DataFeeder(self.__data_types__, feeding) + if reader is None: + assert input 
is not None and isinstance(input, collections.Iterable) + if not isinstance(input, collections.Iterable): + raise TypeError("When reader is None, input should be whole " + "inference data and should be iterable") + + if batch_size is None: + if not hasattr(input, '__len__'): + raise ValueError("Should set batch size when input data " + "don't contain length.") + batch_size = len(input) + + def __reader_impl__(): + for each_sample in input: + if len(feeder) == 1: + yield [each_sample] + else: + yield each_sample + + reader = minibatch.batch(__reader_impl__, batch_size=batch_size) + else: + if input is not None: + raise ValueError("User should set either input or reader, " + "should not set them both.") self.__gradient_machine__.start() for data_batch in reader(): yield self.__gradient_machine__.forwardTest(feeder(data_batch)) @@ -46,6 +71,52 @@ class Inference(object): return retv -def infer(output, parameters, reader, feeding=None, field='value'): +def infer(output, + parameters, + input=None, + batch_size=None, + reader=None, + feeding=None, + field='value'): + """ + Infer a neural network by given neural network output and parameters. The + user should pass either a batch of input data or reader method. + + Example usages: + + .. code-block:: python + + result = paddle.infer(prediction, parameters, input=SomeData, + batch_size=32) + print result + + :param output: output of the neural network that would be inferred + :type output: paddle.v2.config_base.Layer + :param parameters: parameters of the neural network. + :type parameters: paddle.v2.parameters.Parameters + :param input: input data batch. Should be a python iterable object, and each + element is the data batch. + :type input: collections.Iterable + :param batch_size: the batch size when perform inference. Default is the + length of input. + :type batch_size: int + :param reader: input data reader creator in batch. If this field is set, the + `input` and `batch_size` will be ignored. 
+ :type reader: callable + :param feeding: Reader dictionary. Default could generate from input + value. + :param field: The prediction field. It should in [`value`, `ids`]. `value` + means return the prediction probabilities, `ids` means return + the prediction labels. Default is `value` + :type field: str + :return: a numpy array + :rtype: numpy.ndarray + """ + inferer = Inference(output=output, parameters=parameters) - return inferer.infer(field=field, reader=reader, feeding=feeding) + return inferer.infer( + field=field, + input=input, + batch_size=batch_size, + reader=reader, + feeding=feeding)