diff --git a/demo/recommendation/api_train_v2.py b/demo/recommendation/api_train_v2.py index c726fa5bdb31f87377c15a709121fa57ad3c73c2..3ff86c2c45e05e33135cd2f059a36edb3b5f94b8 100644 --- a/demo/recommendation/api_train_v2.py +++ b/demo/recommendation/api_train_v2.py @@ -2,6 +2,7 @@ import paddle.v2 as paddle def main(): + paddle.init(use_gpu=False, trainer_count=3) movie_title_dict = paddle.dataset.movielens.get_movie_title_dict() uid = paddle.layer.data( name='user_id', @@ -17,10 +18,17 @@ def main(): name='age_id', type=paddle.data_type.integer_value( len(paddle.dataset.movielens.age_table))) - usr_age_emb = paddle.embedding(input=usr_age_id, size=16) + usr_age_emb = paddle.layer.embedding(input=usr_age_id, size=16) - usr_combined_features = paddle.fc( - input=[usr_emb, usr_gender_emb, usr_age_emb], + usr_job_id = paddle.layer.data( + name='job_id', + type=paddle.data_type.integer_value(paddle.dataset.movielens.max_job_id( + ) + 1)) + + usr_job_emb = paddle.layer.embedding(input=usr_job_id, size=16) + + usr_combined_features = paddle.layer.fc( + input=[usr_emb, usr_gender_emb, usr_age_emb, usr_job_emb], size=200, act=paddle.activation.Tanh()) @@ -30,12 +38,59 @@ def main(): paddle.dataset.movielens.max_movie_id() + 1)) mov_emb = paddle.layer.embedding(input=mov_id, size=32) + mov_categories = paddle.layer.data( + name='category_id', + type=paddle.data_type.sparse_binary_vector( + len(paddle.dataset.movielens.movie_categories()))) + + mov_categories_hidden = paddle.layer.fc(input=mov_categories, size=32) + mov_title_id = paddle.layer.data( name='movie_title', - type=paddle.data_type.integer_value(len(movie_title_dict))) - mov_title_emb = paddle.embedding(input=mov_title_id, size=32) - with paddle.layer.mixed() as mixed: - pass + type=paddle.data_type.integer_value_sequence(len(movie_title_dict))) + mov_title_emb = paddle.layer.embedding(input=mov_title_id, size=32) + mov_title_conv = paddle.networks.sequence_conv_pool( + input=mov_title_emb, hidden_size=32, context_len=3) + + mov_combined_features = paddle.layer.fc( + input=[mov_emb, mov_categories_hidden, mov_title_conv], + size=200, + act=paddle.activation.Tanh()) + + inference = paddle.layer.cos_sim( + a=usr_combined_features, b=mov_combined_features, size=1, scale=5) + cost = paddle.layer.regression_cost( + input=inference, + label=paddle.layer.data( + name='score', type=paddle.data_type.dense_vector(1))) + + parameters = paddle.parameters.create(cost) + + trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=paddle.optimizer.Adam( + learning_rate=1e-4)) + + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "Pass %d Batch %d Cost %.2f" % ( + event.pass_id, event.batch_id, event.cost) + + trainer.train( + reader=paddle.reader.batched( + paddle.dataset.movielens.train(), batch_size=256), + event_handler=event_handler, + reader_dict={ + 'user_id': 0, + 'gender_id': 1, + 'age_id': 2, + 'job_id': 3, + 'movie_id': 4, + 'category_id': 5, + 'movie_title': 6, + 'score': 7 + }) if __name__ == '__main__': diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py index 6efe42adbaff98c2e63b6e7e3ee099c90b45bf2c..058400502fc31f789dac5cdd023802ec1524d978 100644 --- a/python/paddle/v2/dataset/movielens.py +++ b/python/paddle/v2/dataset/movielens.py @@ -6,7 +6,7 @@ import functools __all__ = [ 'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id', - 'age_table' + 'age_table', 'movie_categories', 'max_job_id' ] age_table = [1, 18, 25, 35, 45, 50, 56] @@ -135,6 +135,23 @@ def max_user_id(): return reduce(__max_index_info__, USER_INFO.viewvalues()).index +def __max_job_id_impl__(a, b): + if a.job_id > b.job_id: + return a + else: + return b + + +def max_job_id(): + __initialize_meta_info__() + return reduce(__max_job_id_impl__, USER_INFO.viewvalues()).job_id + + +def movie_categories(): + __initialize_meta_info__() + return CATEGORIES_DICT + + def unittest(): for train_count, _ in enumerate(train()()): pass