提交 aaa2a1f8 编写于 作者: H helinwang 提交者: GitHub

Merge pull request #1501 from reyoung/feature/recommendation_v2_api

Feature/recommendation v2 api
...@@ -92,12 +92,8 @@ def main(): ...@@ -92,12 +92,8 @@ def main():
def event_handler(event): def event_handler(event):
if isinstance(event, paddle.event.EndIteration): if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 1000 == 0: if event.batch_id % 1000 == 0:
result = trainer.test(reader=paddle.batch( print "Pass %d, Batch %d, Cost %f, %s" % (
paddle.dataset.mnist.test(), batch_size=256)) event.pass_id, event.batch_id, event.cost, event.metrics)
print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics,
result.metrics)
with gzip.open('params.tar.gz', 'w') as f: with gzip.open('params.tar.gz', 'w') as f:
parameters.to_tar(f) parameters.to_tar(f)
...@@ -123,17 +119,16 @@ def main(): ...@@ -123,17 +119,16 @@ def main():
print 'Best pass is %s, testing Avgcost is %s' % (best[0], best[1]) print 'Best pass is %s, testing Avgcost is %s' % (best[0], best[1])
print 'The classification accuracy is %.2f%%' % (100 - float(best[2]) * 100) print 'The classification accuracy is %.2f%%' % (100 - float(best[2]) * 100)
test_creator = paddle.dataset.mnist.test()
test_data = []
for item in test_creator():
test_data.append(item[0])
if len(test_data) == 100:
break
# output is a softmax layer. It returns probabilities. # output is a softmax layer. It returns probabilities.
# Shape should be (100, 10) # Shape should be (100, 10)
probs = paddle.infer( probs = paddle.infer(output=predict, parameters=parameters, input=test_data)
output=predict,
parameters=parameters,
reader=paddle.batch(
paddle.reader.firstn(
paddle.reader.map_readers(lambda item: (item[0], ),
paddle.dataset.mnist.test()),
n=100),
batch_size=32))
print probs.shape print probs.shape
......
import paddle.v2 as paddle
import cPickle
import copy
def main():
    """Train the MovieLens recommendation demo for one pass, then score one pair.

    The network has two branches, one for user features and one for movie
    features, each ending in a 200-unit Tanh fully connected layer.  The two
    branch outputs are compared with ``paddle.layer.cos_sim`` (``scale=5``)
    and trained against the ``score`` input with a regression cost.

    After training, a single (user 234, movie 345) sample is run through
    ``paddle.infer`` and the rescaled prediction is printed.
    """
    paddle.init(use_gpu=False)
    movie_title_dict = paddle.dataset.movielens.get_movie_title_dict()

    # ---- user branch: four id inputs, each embedded, then merged ----
    uid = paddle.layer.data(
        name='user_id',
        type=paddle.data_type.integer_value(
            paddle.dataset.movielens.max_user_id() + 1))
    usr_emb = paddle.layer.embedding(input=uid, size=32)

    usr_gender_id = paddle.layer.data(
        name='gender_id', type=paddle.data_type.integer_value(2))
    usr_gender_emb = paddle.layer.embedding(input=usr_gender_id, size=16)

    usr_age_id = paddle.layer.data(
        name='age_id',
        type=paddle.data_type.integer_value(
            len(paddle.dataset.movielens.age_table)))
    usr_age_emb = paddle.layer.embedding(input=usr_age_id, size=16)

    usr_job_id = paddle.layer.data(
        name='job_id',
        type=paddle.data_type.integer_value(
            paddle.dataset.movielens.max_job_id() + 1))
    usr_job_emb = paddle.layer.embedding(input=usr_job_id, size=16)

    usr_combined_features = paddle.layer.fc(
        input=[usr_emb, usr_gender_emb, usr_age_emb, usr_job_emb],
        size=200,
        act=paddle.activation.Tanh())

    # ---- movie branch: id embedding, category vector, title sequence ----
    mov_id = paddle.layer.data(
        name='movie_id',
        type=paddle.data_type.integer_value(
            paddle.dataset.movielens.max_movie_id() + 1))
    mov_emb = paddle.layer.embedding(input=mov_id, size=32)

    mov_categories = paddle.layer.data(
        name='category_id',
        type=paddle.data_type.sparse_binary_vector(
            len(paddle.dataset.movielens.movie_categories())))
    mov_categories_hidden = paddle.layer.fc(input=mov_categories, size=32)

    mov_title_id = paddle.layer.data(
        name='movie_title',
        type=paddle.data_type.integer_value_sequence(len(movie_title_dict)))
    mov_title_emb = paddle.layer.embedding(input=mov_title_id, size=32)
    mov_title_conv = paddle.networks.sequence_conv_pool(
        input=mov_title_emb, hidden_size=32, context_len=3)

    mov_combined_features = paddle.layer.fc(
        input=[mov_emb, mov_categories_hidden, mov_title_conv],
        size=200,
        act=paddle.activation.Tanh())

    # ---- similarity output and training cost ----
    inference = paddle.layer.cos_sim(
        a=usr_combined_features, b=mov_combined_features, size=1, scale=5)
    cost = paddle.layer.regression_cost(
        input=inference,
        label=paddle.layer.data(
            name='score', type=paddle.data_type.dense_vector(1)))

    parameters = paddle.parameters.create(cost)

    trainer = paddle.trainer.SGD(cost=cost,
                                 parameters=parameters,
                                 update_equation=paddle.optimizer.Adam(
                                     learning_rate=1e-4))

    # Maps each data layer name to the position of its value in a sample.
    feeding = {
        'user_id': 0,
        'gender_id': 1,
        'age_id': 2,
        'job_id': 3,
        'movie_id': 4,
        'category_id': 5,
        'movie_title': 6,
        'score': 7
    }

    def event_handler(event):
        # Report the training cost every 100 batches.
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100 == 0:
                # Parenthesised single expression: prints the same text under
                # the Python 2 print statement and the Python 3 function.
                print("Pass %d Batch %d Cost %.2f" % (
                    event.pass_id, event.batch_id, event.cost))

    trainer.train(
        reader=paddle.batch(
            paddle.reader.shuffle(
                paddle.dataset.movielens.train(), buf_size=8192),
            batch_size=256),
        event_handler=event_handler,
        feeding=feeding,
        num_passes=1)

    # ---- score a single (user, movie) pair ----
    user_id = 234
    movie_id = 345

    user = paddle.dataset.movielens.user_info()[user_id]
    movie = paddle.dataset.movielens.movie_info()[movie_id]

    # One sample: user fields followed by movie fields, no score column.
    feature = user.value() + movie.value()

    def reader():
        yield feature

    # Inference has no label, so drop 'score' from a copy of the feeding map.
    infer_dict = copy.copy(feeding)
    del infer_dict['score']

    prediction = paddle.infer(
        output=inference,
        parameters=parameters,
        reader=paddle.batch(
            reader, batch_size=32),
        feeding=infer_dict)

    # The original `print(prediction + 5) / 2` only worked because `print`
    # is a statement in Python 2; the whole expression is now parenthesised.
    # NOTE(review): presumably rescales the cos_sim output (scale=5) onto the
    # rating range -- confirm against the dataset's score encoding.
    print((prediction + 5) / 2)


if __name__ == '__main__':
    main()
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
Trainer API Trainer API
########### ###########
========== ==========
Parameters Parameters
========== ==========
...@@ -24,3 +25,10 @@ Event ...@@ -24,3 +25,10 @@ Event
.. automodule:: paddle.v2.event .. automodule:: paddle.v2.event
:members: :members:
=========
Inference
=========
.. autofunction:: paddle.v2.infer
\ No newline at end of file
...@@ -85,6 +85,9 @@ class DataFeeder(DataProviderConverter): ...@@ -85,6 +85,9 @@ class DataFeeder(DataProviderConverter):
input_types.append(each[1]) input_types.append(each[1])
DataProviderConverter.__init__(self, input_types) DataProviderConverter.__init__(self, input_types)
def __len__(self):
    """Return the number of entries held in ``self.input_names``."""
    names = self.input_names
    return len(names)
def convert(self, dat, argument=None): def convert(self, dat, argument=None):
""" """
:param dat: A list of mini-batch data. Each sample is a list or tuple :param dat: A list of mini-batch data. Each sample is a list or tuple
......
...@@ -23,7 +23,12 @@ import re ...@@ -23,7 +23,12 @@ import re
import random import random
import functools import functools
__all__ = ['train_creator', 'test_creator'] __all__ = [
'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id',
'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info'
]
age_table = [1, 18, 25, 35, 45, 50, 56]
class MovieInfo(object): class MovieInfo(object):
...@@ -38,17 +43,32 @@ class MovieInfo(object): ...@@ -38,17 +43,32 @@ class MovieInfo(object):
[MOVIE_TITLE_DICT[w.lower()] for w in self.title.split()] [MOVIE_TITLE_DICT[w.lower()] for w in self.title.split()]
] ]
def __str__(self):
    """Render the movie as ``<MovieInfo id(..), title(..), categories(..)>``."""
    fields = (self.index, self.title, self.categories)
    return "<MovieInfo id(%d), title(%s), categories(%s)>" % fields
def __repr__(self):
    """Use the same text as ``__str__`` for ``repr()``."""
    text = self.__str__()
    return text
class UserInfo(object):
    """One user record: id, gender flag, age bucket and job id.

    ``age`` is stored as the position of the raw age value inside the
    module-level ``age_table``, not as the raw value itself.
    """

    def __init__(self, index, gender, age, job_id):
        self.index = int(index)
        self.is_male = (gender == 'M')
        # Raises ValueError if the age is not one of the age_table entries.
        self.age = age_table.index(int(age))
        self.job_id = int(job_id)

    def value(self):
        """Return ``[index, gender code, age bucket, job_id]`` (0 = male, 1 = otherwise)."""
        gender_code = 0 if self.is_male else 1
        return [self.index, gender_code, self.age, self.job_id]

    def __str__(self):
        gender_label = "M" if self.is_male else "F"
        raw_age = age_table[self.age]
        return "<UserInfo id(%d), gender(%s), age(%d), job(%d)>" % (
            self.index, gender_label, raw_age, self.job_id)

    def __repr__(self):
        return str(self)
MOVIE_INFO = None MOVIE_INFO = None
MOVIE_TITLE_DICT = None MOVIE_TITLE_DICT = None
...@@ -59,7 +79,8 @@ USER_INFO = None ...@@ -59,7 +79,8 @@ USER_INFO = None
def __initialize_meta_info__(): def __initialize_meta_info__():
fn = download( fn = download(
url='http://files.grouplens.org/datasets/movielens/ml-1m.zip', url='http://files.grouplens.org/datasets/movielens/ml-1m.zip',
md5='c4d9eecfca2ab87c1945afe126590906') module_name='movielens',
md5sum='c4d9eecfca2ab87c1945afe126590906')
global MOVIE_INFO global MOVIE_INFO
if MOVIE_INFO is None: if MOVIE_INFO is None:
pattern = re.compile(r'^(.*)\((\d+)\)$') pattern = re.compile(r'^(.*)\((\d+)\)$')
...@@ -122,14 +143,63 @@ def __reader_creator__(**kwargs): ...@@ -122,14 +143,63 @@ def __reader_creator__(**kwargs):
return lambda: __reader__(**kwargs) return lambda: __reader__(**kwargs)
train_creator = functools.partial(__reader_creator__, is_test=False) train = functools.partial(__reader_creator__, is_test=False)
test_creator = functools.partial(__reader_creator__, is_test=True) test = functools.partial(__reader_creator__, is_test=True)
def get_movie_title_dict():
    """Run ``__initialize_meta_info__()`` and return ``MOVIE_TITLE_DICT``."""
    __initialize_meta_info__()
    title_dict = MOVIE_TITLE_DICT
    return title_dict
def __max_index_info__(a, b):
    """Return whichever argument has the strictly larger ``index``; *b* on ties."""
    return a if a.index > b.index else b
def max_movie_id():
    """Return the largest ``index`` found among the values of ``MOVIE_INFO``."""
    __initialize_meta_info__()
    top_movie = reduce(__max_index_info__, MOVIE_INFO.viewvalues())
    return top_movie.index
def max_user_id():
    """Return the largest ``index`` found among the values of ``USER_INFO``."""
    __initialize_meta_info__()
    top_user = reduce(__max_index_info__, USER_INFO.viewvalues())
    return top_user.index
def __max_job_id_impl__(a, b):
    """Return whichever argument has the strictly larger ``job_id``; *b* on ties."""
    return a if a.job_id > b.job_id else b
def max_job_id():
    """Return the largest ``job_id`` found among the values of ``USER_INFO``."""
    __initialize_meta_info__()
    top_user = reduce(__max_job_id_impl__, USER_INFO.viewvalues())
    return top_user.job_id
def movie_categories():
    """Run ``__initialize_meta_info__()`` and return ``CATEGORIES_DICT``."""
    __initialize_meta_info__()
    categories = CATEGORIES_DICT
    return categories
def user_info():
    """Run ``__initialize_meta_info__()`` and return ``USER_INFO``."""
    __initialize_meta_info__()
    users = USER_INFO
    return users
def movie_info():
    """Run ``__initialize_meta_info__()`` and return ``MOVIE_INFO``."""
    __initialize_meta_info__()
    movies = MOVIE_INFO
    return movies
def unittest(): def unittest():
for train_count, _ in enumerate(train_creator()()): for train_count, _ in enumerate(train()()):
pass pass
for test_count, _ in enumerate(test_creator()()): for test_count, _ in enumerate(test()()):
pass pass
print train_count, test_count print train_count, test_count
......
import numpy
import py_paddle.swig_paddle as api import py_paddle.swig_paddle as api
import collections
import topology import topology
import minibatch
from data_feeder import DataFeeder from data_feeder import DataFeeder
import itertools
import numpy
__all__ = ['infer'] __all__ = ['infer']
...@@ -21,8 +21,33 @@ class Inference(object): ...@@ -21,8 +21,33 @@ class Inference(object):
self.__gradient_machine__ = gm self.__gradient_machine__ = gm
self.__data_types__ = topo.data_type() self.__data_types__ = topo.data_type()
def iter_infer(self, input=None, batch_size=None, reader=None,
               feeding=None):
    """
    Run forward inference lazily, yielding one result per mini-batch.

    Pass either ``input`` (the whole inference data) or ``reader`` (a
    batched reader creator), not both.  Because this is a generator, the
    argument checks below run when iteration starts, not at call time.

    :param input: iterable of samples; used only when ``reader`` is None.
                  When the feeder has exactly one input slot, each element
                  is wrapped in a one-item list before being batched.
    :type input: collections.Iterable
    :param batch_size: mini-batch size for ``input``.  Defaults to
                       ``len(input)``.
    :type batch_size: int
    :param reader: callable returning an iterable of mini-batches.
    :type reader: callable
    :param feeding: passed through to ``DataFeeder``.
    :raise TypeError: ``reader`` is None and ``input`` is not iterable
                      (including ``input`` being None).
    :raise ValueError: ``batch_size`` is None and ``input`` has no
                       ``__len__``, or both ``input`` and ``reader`` are
                       given.
    """
    feeder = DataFeeder(self.__data_types__, feeding)
    if reader is None:
        # An `assert` used to precede this check; it made the TypeError
        # unreachable and disappeared under `python -O`.  The explicit
        # check alone also rejects None, since None is not Iterable.
        if not isinstance(input, collections.Iterable):
            raise TypeError("When reader is None, input should be whole "
                            "inference data and should be iterable")

        if batch_size is None:
            if not hasattr(input, '__len__'):
                raise ValueError("Should set batch size when input data "
                                 "don't contain length.")
            batch_size = len(input)

        def __reader_impl__():
            # Loop-invariant: decide once whether samples need wrapping.
            wrap_sample = len(feeder) == 1
            for each_sample in input:
                if wrap_sample:
                    yield [each_sample]
                else:
                    yield each_sample

        reader = minibatch.batch(__reader_impl__, batch_size=batch_size)
    else:
        if input is not None:
            raise ValueError("User should set either input or reader, "
                             "should not set them both.")
    self.__gradient_machine__.start()
    for data_batch in reader():
        yield self.__gradient_machine__.forwardTest(feeder(data_batch))
...@@ -46,6 +71,52 @@ class Inference(object): ...@@ -46,6 +71,52 @@ class Inference(object):
return retv return retv
def infer(output,
          parameters,
          input=None,
          batch_size=None,
          reader=None,
          feeding=None,
          field='value'):
    """
    Run inference for a network given its output layer and parameters.

    This builds an ``Inference`` object from ``output`` and ``parameters``
    and forwards the remaining arguments to its ``infer`` method.  Supply
    the data either as ``input`` or through ``reader``.

    Example usage:

    ..  code-block:: python

        result = paddle.infer(prediction, parameters, input=SomeData,
                              batch_size=32)
        print result

    :param output: output of the neural network that would be inferred
    :type output: paddle.v2.config_base.Layer
    :param parameters: parameters of the neural network.
    :type parameters: paddle.v2.parameters.Parameters
    :param input: the inference data, as a python iterable of samples.
    :type input: collections.Iterable
    :param batch_size: the batch size used for inference.  Defaults to the
                       length of ``input``.
    :type batch_size: int
    :param reader: input data reader creator that yields batches.
                   NOTE(review): ``Inference.iter_infer`` raises ValueError
                   when both ``input`` and ``reader`` are given, so the two
                   are presumably mutually exclusive rather than ``input``
                   being ignored -- confirm.
    :type reader: callable
    :param feeding: Reader dictionary. Default could generate from input
                    value.
    :param field: The prediction field. It should be one of [`value`,
                  `ids`]. `value` means return the prediction
                  probabilities, `ids` means return the prediction labels.
                  Default is `value`.
    :type field: str
    :return: a numpy array
    :rtype: numpy.ndarray
    """
    engine = Inference(output=output, parameters=parameters)
    result = engine.infer(
        field=field,
        input=input,
        batch_size=batch_size,
        reader=reader,
        feeding=feeding)
    return result
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册