import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid.core as core
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.nets as nets
from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.optimizer import SGDOptimizer

IS_SPARSE = True
USE_GPU = False
BATCH_SIZE = 256

def get_usr_combined_features():
    """Build the user-side feature tower.

    Embeds four categorical user attributes (id, gender, age bucket, job),
    passes each embedding through its own FC layer, and fuses them with a
    200-wide tanh FC. Returns the fused feature variable.
    """
    # FIXME(dzh) : old API integer_value(10) may has range check.
    # currently we don't have user configurated check.

    uid_dict_size = paddle.dataset.movielens.max_user_id() + 1
    uid_input = layers.data(name='user_id', shape=[1], dtype='int64')
    uid_embed = layers.embedding(
        input=uid_input,
        dtype='float32',
        size=[uid_dict_size, 32],
        param_attr={'name': 'user_table'},
        is_sparse=IS_SPARSE)
    uid_feature = layers.fc(input=uid_embed, size=32)

    gender_dict_size = 2
    gender_input = layers.data(name='gender_id', shape=[1], dtype='int64')
    gender_embed = layers.embedding(
        input=gender_input,
        size=[gender_dict_size, 16],
        param_attr={'name': 'gender_table'},
        is_sparse=IS_SPARSE)
    gender_feature = layers.fc(input=gender_embed, size=16)

    age_dict_size = len(paddle.dataset.movielens.age_table)
    age_input = layers.data(name='age_id', shape=[1], dtype="int64")
    age_embed = layers.embedding(
        input=age_input,
        size=[age_dict_size, 16],
        is_sparse=IS_SPARSE,
        param_attr={'name': 'age_table'})
    age_feature = layers.fc(input=age_embed, size=16)

    job_dict_size = paddle.dataset.movielens.max_job_id() + 1
    job_input = layers.data(name='job_id', shape=[1], dtype="int64")
    job_embed = layers.embedding(
        input=job_input,
        size=[job_dict_size, 16],
        param_attr={'name': 'job_table'},
        is_sparse=IS_SPARSE)
    job_feature = layers.fc(input=job_embed, size=16)

    # Fuse the four per-attribute features into one user representation.
    fused = layers.concat(
        input=[uid_feature, gender_feature, age_feature, job_feature], axis=1)

    usr_combined_features = layers.fc(input=fused, size=200, act="tanh")

    return usr_combined_features


def get_mov_combined_features():
    """Build the movie-side feature tower.

    Combines the movie-id embedding, a sum-pooled category embedding and a
    conv+pool encoding of the title word sequence, fused through a 200-wide
    tanh FC. Returns the fused feature variable.
    """
    movie_dict_size = paddle.dataset.movielens.max_movie_id() + 1
    movie_input = layers.data(name='movie_id', shape=[1], dtype='int64')
    movie_embed = layers.embedding(
        input=movie_input,
        dtype='float32',
        size=[movie_dict_size, 32],
        param_attr={'name': 'movie_table'},
        is_sparse=IS_SPARSE)
    movie_feature = layers.fc(input=movie_embed, size=32)

    # A movie carries a variable-length list of categories; sum-pool the
    # per-category embeddings into a single vector.
    category_dict_size = len(paddle.dataset.movielens.movie_categories())
    category_input = layers.data(name='category_id', shape=[1], dtype='int64')
    category_embed = layers.embedding(
        input=category_input, size=[category_dict_size, 32],
        is_sparse=IS_SPARSE)
    category_feature = layers.sequence_pool(
        input=category_embed, pool_type="sum")

    # Titles are variable-length word sequences; encode with conv + sum pool.
    title_dict_size = len(paddle.dataset.movielens.get_movie_title_dict())
    title_input = layers.data(name='movie_title', shape=[1], dtype='int64')
    title_embed = layers.embedding(
        input=title_input, size=[title_dict_size, 32], is_sparse=IS_SPARSE)
    title_feature = nets.sequence_conv_pool(
        input=title_embed,
        num_filters=32,
        filter_size=3,
        act="tanh",
        pool_type="sum")

    fused = layers.concat(
        input=[movie_feature, category_feature, title_feature], axis=1)

    # FIXME(dzh) : need tanh operator
    mov_combined_features = layers.fc(input=fused, size=200, act="tanh")

    return mov_combined_features


def model():
    """Compose the full network.

    Predicted preference is the cosine similarity between the user and movie
    feature towers; the training objective is the mean squared error against
    the observed rating ('score'). Returns the average-cost variable.
    """
    user_features = get_usr_combined_features()
    movie_features = get_mov_combined_features()

    # need cos sim
    scale_infer = layers.cos_sim(X=user_features, Y=movie_features)

    rating = layers.data(name='score', shape=[1], dtype='float32')
    cost = layers.square_error_cost(input=scale_infer, label=rating)

    return layers.mean(x=cost)


def main():
    """Build the recommender model and train it with SGD on movielens.

    Exits with status 0 as soon as the average batch cost drops below 6.0
    (the test's success criterion), or after PASS_NUM full passes.
    """
    cost = model()
    sgd_optimizer = SGDOptimizer(learning_rate=0.2)
    # minimize() appends the backward and optimizer ops to the default
    # program; the returned ops are not needed here.
    sgd_optimizer.minimize(cost)

    place = core.GPUPlace(0) if USE_GPU else core.CPUPlace()

    exe = Executor(place)
    exe.run(framework.default_startup_program())

    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.movielens.train(), buf_size=8192),
        batch_size=BATCH_SIZE)

    # Maps each feed name to its column index in a movielens reader row.
    feeding = {
        'user_id': 0,
        'gender_id': 1,
        'age_id': 2,
        'job_id': 3,
        'movie_id': 4,
        'category_id': 5,
        'movie_title': 6,
        'score': 7
    }

    def func_feed(feeding, data):
        """Convert a batch of raw reader rows into a feed dict of LoDTensors.

        NOTE: uses dict.items() (not the Python-2-only iteritems()) and list
        comprehensions (np.array(map(...)) breaks on Python 3, where map()
        returns a lazy iterator), so this runs on both Python 2 and 3.
        """
        feed_tensors = {}
        for key, idx in feeding.items():
            tensor = core.LoDTensor()
            if key not in ("category_id", "movie_title"):
                # Fixed-length scalar columns: scores are float, ids are int.
                dtype = "float32" if key == "score" else "int64"
                numpy_data = np.array([row[idx] for row in data]).astype(dtype)
            else:
                # Variable-length sequence columns: flatten the batch and
                # record per-row lengths as cumulative LoD offsets.
                seqs = [np.array(row[idx]).astype("int64") for row in data]
                lod = [0]
                for seq in seqs:
                    lod.append(lod[-1] + len(seq))
                numpy_data = np.concatenate(seqs, axis=0)
                tensor.set_lod([lod])

            numpy_data = numpy_data.reshape([numpy_data.shape[0], 1])
            tensor.set(numpy_data, place)
            feed_tensors[key] = tensor
        return feed_tensors

    PASS_NUM = 100
    for pass_id in range(PASS_NUM):
        for data in train_reader():
            outs = exe.run(framework.default_main_program(),
                           feed=func_feed(feeding, data),
                           fetch_list=[cost])
            out = np.array(outs[0])
            if out[0] < 6.0:
                # if avg cost less than 6.0, we think our code is good.
                exit(0)


# Guard the entry point so importing this module does not start training.
if __name__ == "__main__":
    main()