# test_recommender_system.py
# Recommender-system (MovieLens) training example on the PaddlePaddle v2 framework.
import paddle.v2 as paddle
import paddle.v2.framework.layers as layers
import paddle.v2.framework.nets as nets
import paddle.v2.framework.core as core
import paddle.v2.framework.optimizer as optimizer

from paddle.v2.framework.framework import Program, g_main_program
from paddle.v2.framework.executor import Executor

import numpy as np

# Two separate programs: `startup_program` holds the parameter-initialization
# ops, `main_program` holds the forward/backward/optimization graph.
startup_program = Program()
main_program = Program()

is_sparse = True  # use sparse updates for the embedding-table gradients
use_gpu = False
BATCH_SIZE = 256


def get_usr_combined_features():
    """Build the user-side feature tower.

    Embeds user id, gender, age and job, projects each embedding through a
    small fc layer, concatenates them, and fuses the result into a single
    200-d tanh feature vector.

    Returns:
        The fc layer output variable holding the combined user features.
    """
    # FIXME(dzh): the old API integer_value(10) may have had a range check;
    # currently there is no user-configurable check here.

    USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1

    uid = layers.data(
        name='user_id',
        shape=[1],
        data_type='int64',
        main_program=main_program,
        startup_program=startup_program)

    usr_emb = layers.embedding(
        input=uid,
        data_type='float32',
        size=[USR_DICT_SIZE, 32],
        param_attr={'name': 'user_table'},
        is_sparse=is_sparse,
        main_program=main_program,
        startup_program=startup_program)

    usr_fc = layers.fc(input=usr_emb,
                       size=32,
                       main_program=main_program,
                       startup_program=startup_program)

    USR_GENDER_DICT_SIZE = 2

    usr_gender_id = layers.data(
        name='gender_id',
        shape=[1],
        data_type='int64',
        main_program=main_program,
        startup_program=startup_program)

    usr_gender_emb = layers.embedding(
        input=usr_gender_id,
        size=[USR_GENDER_DICT_SIZE, 16],
        param_attr={'name': 'gender_table'},
        is_sparse=is_sparse,
        main_program=main_program,
        startup_program=startup_program)

    usr_gender_fc = layers.fc(input=usr_gender_emb,
                              size=16,
                              main_program=main_program,
                              startup_program=startup_program)

    USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table)
    usr_age_id = layers.data(
        name='age_id',
        shape=[1],
        data_type="int64",
        main_program=main_program,
        startup_program=startup_program)

    usr_age_emb = layers.embedding(
        input=usr_age_id,
        size=[USR_AGE_DICT_SIZE, 16],
        is_sparse=is_sparse,
        param_attr={'name': 'age_table'},
        main_program=main_program,
        startup_program=startup_program)

    usr_age_fc = layers.fc(input=usr_age_emb,
                           size=16,
                           main_program=main_program,
                           startup_program=startup_program)

    USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1
    usr_job_id = layers.data(
        name='job_id',
        shape=[1],
        data_type="int64",
        main_program=main_program,
        startup_program=startup_program)

    usr_job_emb = layers.embedding(
        input=usr_job_id,
        size=[USR_JOB_DICT_SIZE, 16],
        param_attr={'name': 'job_table'},
        is_sparse=is_sparse,
        main_program=main_program,
        startup_program=startup_program)

    usr_job_fc = layers.fc(input=usr_job_emb,
                           size=16,
                           main_program=main_program,
                           startup_program=startup_program)

    # Fuse the four per-field features into one 200-d user representation.
    concat_embed = layers.concat(
        input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc],
        axis=1,
        main_program=main_program,
        startup_program=startup_program)

    usr_combined_features = layers.fc(input=concat_embed,
                                      size=200,
                                      act="tanh",
                                      main_program=main_program,
                                      startup_program=startup_program)

    return usr_combined_features


def get_mov_combined_features():
    """Build the movie-side feature tower.

    Embeds movie id, sum-pools the category-sequence embedding, runs a
    conv+pool over the title-word sequence, concatenates the three, and
    fuses them into a single 200-d tanh feature vector.

    Returns:
        The fc layer output variable holding the combined movie features.
    """
    MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1

    mov_id = layers.data(
        name='movie_id',
        shape=[1],
        data_type='int64',
        main_program=main_program,
        startup_program=startup_program)

    mov_emb = layers.embedding(
        input=mov_id,
        data_type='float32',
        size=[MOV_DICT_SIZE, 32],
        param_attr={'name': 'movie_table'},
        is_sparse=is_sparse,
        main_program=main_program,
        startup_program=startup_program)

    mov_fc = layers.fc(input=mov_emb,
                       size=32,
                       main_program=main_program,
                       startup_program=startup_program)

    CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())

    # Categories form a variable-length sequence; sum-pool to a fixed size.
    category_id = layers.data(
        name='category_id',
        shape=[1],
        data_type='int64',
        main_program=main_program,
        startup_program=startup_program)

    mov_categories_emb = layers.embedding(
        input=category_id,
        size=[CATEGORY_DICT_SIZE, 32],
        is_sparse=is_sparse,
        main_program=main_program,
        startup_program=startup_program)

    mov_categories_hidden = layers.sequence_pool(
        input=mov_categories_emb,
        pool_type="sum",
        main_program=main_program,
        startup_program=startup_program)

    MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())

    mov_title_id = layers.data(
        name='movie_title',
        shape=[1],
        data_type='int64',
        main_program=main_program,
        startup_program=startup_program)

    mov_title_emb = layers.embedding(
        input=mov_title_id,
        size=[MOV_TITLE_DICT_SIZE, 32],
        is_sparse=is_sparse,
        main_program=main_program,
        startup_program=startup_program)

    # Text-CNN over the title word sequence.
    mov_title_conv = nets.sequence_conv_pool(
        input=mov_title_emb,
        num_filters=32,
        filter_size=3,
        act="tanh",
        pool_type="sum",
        main_program=main_program,
        startup_program=startup_program)

    concat_embed = layers.concat(
        input=[mov_fc, mov_categories_hidden, mov_title_conv],
        axis=1,
        main_program=main_program,
        startup_program=startup_program)

    # FIXME(dzh): need a tanh operator
    mov_combined_features = layers.fc(input=concat_embed,
                                      size=200,
                                      act="tanh",
                                      main_program=main_program,
                                      startup_program=startup_program)

    return mov_combined_features


def model():
    """Assemble the full rating-prediction network.

    Scores a (user, movie) pair by the cosine similarity of the two feature
    towers, then regresses that score against the ground-truth rating with a
    squared-error loss.

    Returns:
        The mean squared-error cost variable to be minimized.
    """
    usr_combined_features = get_usr_combined_features()
    mov_combined_features = get_mov_combined_features()

    # Predicted rating = cosine similarity between the two towers.
    inference = layers.cos_sim(
        X=usr_combined_features,
        Y=mov_combined_features,
        main_program=main_program,
        startup_program=startup_program)

    label = layers.data(
        name='score',
        shape=[1],
        data_type='float32',
        main_program=main_program,
        startup_program=startup_program)

    square_cost = layers.square_error_cost(
        input=inference,
        label=label,
        main_program=main_program,
        startup_program=startup_program)

    avg_cost = layers.mean(
        x=square_cost,
        main_program=main_program,
        startup_program=startup_program)

    return avg_cost


def main():
    """Train the recommender model on MovieLens with SGD.

    Runs up to PASS_NUM passes over the shuffled training set and exits
    with status 0 as soon as the average cost drops below 6.0.
    """
    cost = model()
    sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.2)
    opts = sgd_optimizer.minimize(cost, startup_program=startup_program)

    if use_gpu:
        place = core.GPUPlace(0)
    else:
        place = core.CPUPlace()

    exe = Executor(place)
    # Run the startup program once to initialize all parameters.
    exe.run(startup_program, feed={}, fetch_list=[])

    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.movielens.train(), buf_size=8192),
        batch_size=BATCH_SIZE)

    # Maps each feed name to its column index in a MovieLens sample tuple.
    feeding = {
        'user_id': 0,
        'gender_id': 1,
        'age_id': 2,
        'job_id': 3,
        'movie_id': 4,
        'category_id': 5,
        'movie_title': 6,
        'score': 7
    }

    def func_feed(feeding, data):
        """Convert a batch of sample tuples into name -> LoDTensor feeds.

        Scalar fields become dense (N, 1) tensors; the two sequence fields
        ('category_id', 'movie_title') are flattened with LoD offsets.
        """
        feed_tensors = {}
        for (key, idx) in feeding.iteritems():
            tensor = core.LoDTensor()
            if key != "category_id" and key != "movie_title":
                if key == "score":
                    numpy_data = np.array(map(lambda x: x[idx], data)).astype(
                        "float32")
                else:
                    numpy_data = np.array(map(lambda x: x[idx], data)).astype(
                        "int64")
            else:
                # Variable-length sequence: concatenate all samples and
                # record the cumulative offsets as level-of-detail info.
                numpy_data = map(lambda x: np.array(x[idx]).astype("int64"),
                                 data)
                lod_info = [len(item) for item in numpy_data]
                offset = 0
                lod = [offset]
                for item in lod_info:
                    offset += item
                    lod.append(offset)
                numpy_data = np.concatenate(numpy_data, axis=0)
                tensor.set_lod([lod])

            numpy_data = numpy_data.reshape([numpy_data.shape[0], 1])
            tensor.set(numpy_data, place)
            feed_tensors[key] = tensor
        return feed_tensors

    PASS_NUM = 100
    for pass_id in range(PASS_NUM):
        for data in train_reader():
            outs = exe.run(main_program,
                           feed=func_feed(feeding, data),
                           fetch_list=[cost])
            out = np.array(outs[0])
            if out[0] < 6.0:
                # if avg cost less than 6.0, we think our code is good.
                exit(0)


# Guard the entry point so importing this module does not start training.
if __name__ == '__main__':
    main()