import paddle.v2 as paddle
import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.nets as nets
import paddle.v2.fluid.core as core
import paddle.v2.fluid.optimizer as optimizer
import paddle.v2.fluid.framework as framework
from paddle.v2.fluid.executor import Executor

import numpy as np

IS_SPARSE = True
USE_GPU = False
BATCH_SIZE = 256


def get_usr_combined_features():
    """Assemble the user-side feature vector.

    Embeds the user id, gender, age bucket and job id, projects each
    embedding through a small fc layer, concatenates the four outputs,
    and maps them to a single 200-d tanh feature used for the
    cosine-similarity match against the movie features.
    """
    # FIXME(dzh): the old API integer_value(10) may have had a range check;
    # currently there is no user-configurable check.

    USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1

    uid = layers.data(name='user_id', shape=[1], data_type='int64')
    usr_emb = layers.embedding(
        input=uid,
        data_type='float32',
        size=[USR_DICT_SIZE, 32],
        param_attr={'name': 'user_table'},
        is_sparse=IS_SPARSE)
    usr_fc = layers.fc(input=usr_emb, size=32)

    USR_GENDER_DICT_SIZE = 2

    usr_gender_id = layers.data(name='gender_id', shape=[1], data_type='int64')
    usr_gender_emb = layers.embedding(
        input=usr_gender_id,
        size=[USR_GENDER_DICT_SIZE, 16],
        param_attr={'name': 'gender_table'},
        is_sparse=IS_SPARSE)
    usr_gender_fc = layers.fc(input=usr_gender_emb, size=16)

    USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table)

    usr_age_id = layers.data(name='age_id', shape=[1], data_type="int64")
    usr_age_emb = layers.embedding(
        input=usr_age_id,
        size=[USR_AGE_DICT_SIZE, 16],
        is_sparse=IS_SPARSE,
        param_attr={'name': 'age_table'})
    usr_age_fc = layers.fc(input=usr_age_emb, size=16)

    USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1

    usr_job_id = layers.data(name='job_id', shape=[1], data_type="int64")
    usr_job_emb = layers.embedding(
        input=usr_job_id,
        size=[USR_JOB_DICT_SIZE, 16],
        param_attr={'name': 'job_table'},
        is_sparse=IS_SPARSE)
    usr_job_fc = layers.fc(input=usr_job_emb, size=16)

    # Merge the four per-field features into one 200-d user feature.
    concat_embed = layers.concat(
        input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], axis=1)
    usr_combined_features = layers.fc(input=concat_embed,
                                      size=200,
                                      act="tanh")

    return usr_combined_features


def get_mov_combined_features():
    """Assemble the movie-side feature vector.

    Embeds the movie id, sum-pools the (variable-length) category
    sequence embedding, convolves + pools the title word sequence,
    concatenates the three outputs and maps them to a 200-d tanh
    feature matching the user feature's width.
    """
    MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1

    mov_id = layers.data(name='movie_id', shape=[1], data_type='int64')
    mov_emb = layers.embedding(
        input=mov_id,
        data_type='float32',
        size=[MOV_DICT_SIZE, 32],
        param_attr={'name': 'movie_table'},
        is_sparse=IS_SPARSE)
    mov_fc = layers.fc(input=mov_emb, size=32)

    CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())

    category_id = layers.data(name='category_id', shape=[1], data_type='int64')
    mov_categories_emb = layers.embedding(
        input=category_id,
        size=[CATEGORY_DICT_SIZE, 32],
        is_sparse=IS_SPARSE)
    # A movie has a variable number of categories; sum-pool them into
    # one fixed-size vector.
    mov_categories_hidden = layers.sequence_pool(
        input=mov_categories_emb, pool_type="sum")

    MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())

    mov_title_id = layers.data(name='movie_title', shape=[1], data_type='int64')
    mov_title_emb = layers.embedding(
        input=mov_title_id,
        size=[MOV_TITLE_DICT_SIZE, 32],
        is_sparse=IS_SPARSE)
    # Text conv over the title word sequence, sum-pooled to a fixed size.
    mov_title_conv = nets.sequence_conv_pool(
        input=mov_title_emb,
        num_filters=32,
        filter_size=3,
        act="tanh",
        pool_type="sum")

    concat_embed = layers.concat(
        input=[mov_fc, mov_categories_hidden, mov_title_conv], axis=1)

    # FIXME(dzh): need a tanh operator.
    mov_combined_features = layers.fc(input=concat_embed,
                                      size=200,
                                      act="tanh")

    return mov_combined_features


def model():
    """Define the rating-regression network and return its average cost.

    The predicted rating is the cosine similarity between the user and
    movie feature vectors; the loss is the mean squared error against
    the ground-truth score.
    """
    usr_combined_features = get_usr_combined_features()
    mov_combined_features = get_mov_combined_features()

    # Predicted rating: cosine similarity of the two 200-d features.
    inference = layers.cos_sim(
        X=usr_combined_features, Y=mov_combined_features)

    label = layers.data(name='score', shape=[1], data_type='float32')

    square_cost = layers.square_error_cost(input=inference, label=label)
    avg_cost = layers.mean(x=square_cost)

    return avg_cost


def main():
    """Train the recommender with SGD on movielens.

    Stops (successfully) as soon as a mini-batch's average cost drops
    below 6.0; otherwise runs for at most PASS_NUM passes.
    """
    cost = model()
    sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.2)
    opts = sgd_optimizer.minimize(cost)

    if USE_GPU:
        place = core.GPUPlace(0)
    else:
        place = core.CPUPlace()

    exe = Executor(place)
    exe.run(framework.default_startup_program())

    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.movielens.train(), buf_size=8192),
        batch_size=BATCH_SIZE)

    # Maps each data-layer name to its column index in a movielens sample.
    feeding = {
        'user_id': 0,
        'gender_id': 1,
        'age_id': 2,
        'job_id': 3,
        'movie_id': 4,
        'category_id': 5,
        'movie_title': 6,
        'score': 7
    }

    def func_feed(feeding, data):
        """Convert one mini-batch into a {layer_name: LoDTensor} feed dict."""
        feed_tensors = {}
        # NOTE: .items() instead of the Python-2-only .iteritems(), and list
        # comprehensions instead of map(), so this also runs under Python 3
        # (np.array over a py3 map iterator would yield a 0-d object array).
        for key, idx in feeding.items():
            tensor = core.LoDTensor()
            if key != "category_id" and key != "movie_title":
                # Dense columns: one scalar per sample.
                dtype = "float32" if key == "score" else "int64"
                numpy_data = np.array([x[idx] for x in data]).astype(dtype)
            else:
                # Sequence columns hold a variable-length id list per
                # sample: record the cumulative offsets as LoD info and
                # concatenate the sequences into one flat array.
                seqs = [np.array(x[idx]).astype("int64") for x in data]
                lod = [0]
                for seq in seqs:
                    lod.append(lod[-1] + len(seq))
                numpy_data = np.concatenate(seqs, axis=0)
                tensor.set_lod([lod])

            # All data layers are declared with shape [1], so feed a column.
            numpy_data = numpy_data.reshape([numpy_data.shape[0], 1])
            tensor.set(numpy_data, place)
            feed_tensors[key] = tensor
        return feed_tensors

    PASS_NUM = 100
    for pass_id in range(PASS_NUM):
        for data in train_reader():
            outs = exe.run(framework.default_main_program(),
                           feed=func_feed(feeding, data),
                           fetch_list=[cost])
            out = np.array(outs[0])
            if out[0] < 6.0:
                # If the avg cost is less than 6.0, we consider the
                # implementation good and stop training.
                return


# Guard the entry point so importing this module does not start training.
if __name__ == '__main__':
    main()