# test_recommender_system.py
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid.core as core
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.nets as nets
from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.optimizer import SGDOptimizer

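# Test configuration: use sparse gradient updates for the embedding tables
# and run on CPU unless USE_GPU is flipped on.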
IS_SPARSE = True
USE_GPU = False
BATCH_SIZE = 256


def get_usr_combined_features():
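    """Build the user-side feature tower.

    Each categorical user field (id, gender, age bucket, job) is embedded,
    run through its own fc layer, and the four vectors are concatenated and
    projected to a single 200-d tanh feature.
    """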
    # FIXME(dzh): the old API integer_value(10) may have had a range check;
    # currently we do not have a user-configured check.

    USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1

    uid = layers.data(name='user_id', shape=[1], dtype='int64')

    usr_emb = layers.embedding(
        input=uid,
        dtype='float32',
        size=[USR_DICT_SIZE, 32],
        param_attr='user_table',
        is_sparse=IS_SPARSE)

    usr_fc = layers.fc(input=usr_emb, size=32)

    USR_GENDER_DICT_SIZE = 2

    usr_gender_id = layers.data(name='gender_id', shape=[1], dtype='int64')

    usr_gender_emb = layers.embedding(
        input=usr_gender_id,
        size=[USR_GENDER_DICT_SIZE, 16],
        param_attr='gender_table',
        is_sparse=IS_SPARSE)

    usr_gender_fc = layers.fc(input=usr_gender_emb, size=16)

    USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table)
    usr_age_id = layers.data(name='age_id', shape=[1], dtype="int64")

    usr_age_emb = layers.embedding(
        input=usr_age_id,
        size=[USR_AGE_DICT_SIZE, 16],
        is_sparse=IS_SPARSE,
        param_attr='age_table')

    usr_age_fc = layers.fc(input=usr_age_emb, size=16)

    USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1
    usr_job_id = layers.data(name='job_id', shape=[1], dtype="int64")

    usr_job_emb = layers.embedding(
        input=usr_job_id,
        size=[USR_JOB_DICT_SIZE, 16],
        param_attr='job_table',
        is_sparse=IS_SPARSE)

    usr_job_fc = layers.fc(input=usr_job_emb, size=16)

    concat_embed = layers.concat(
        input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], axis=1)

    usr_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")

    return usr_combined_features


def get_mov_combined_features():
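    """Build the movie-side feature tower.

    The movie id is embedded and run through an fc layer; the category list
    is sum-pooled over its embedding sequence; the title word sequence goes
    through a conv + sum-pool text block. The three vectors are concatenated
    and projected to a single 200-d tanh feature.
    """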

    MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1

    mov_id = layers.data(name='movie_id', shape=[1], dtype='int64')

    mov_emb = layers.embedding(
        input=mov_id,
        dtype='float32',
        size=[MOV_DICT_SIZE, 32],
        param_attr='movie_table',
        is_sparse=IS_SPARSE)

    mov_fc = layers.fc(input=mov_emb, size=32)

    CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())

    category_id = layers.data(name='category_id', shape=[1], dtype='int64')

    mov_categories_emb = layers.embedding(
        input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE)

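    # A movie carries a variable-length list of categories, so the embedded
    # sequence is sum-pooled into one fixed-size vector.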
    mov_categories_hidden = layers.sequence_pool(
        input=mov_categories_emb, pool_type="sum")

    MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())

    mov_title_id = layers.data(name='movie_title', shape=[1], dtype='int64')

    mov_title_emb = layers.embedding(
        input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE)

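    # The title is a variable-length word-id sequence; a sequence conv +
    # sum-pool block turns it into a fixed-size vector.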
    mov_title_conv = nets.sequence_conv_pool(
        input=mov_title_emb,
        num_filters=32,
        filter_size=3,
        act="tanh",
        pool_type="sum")

    concat_embed = layers.concat(
        input=[mov_fc, mov_categories_hidden, mov_title_conv], axis=1)

    # FIXME(dzh): need tanh operator
    mov_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")

    return mov_combined_features


def model():
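    """Predict a rating as the cosine similarity of the user and movie
    feature towers, scaled by 5.0 onto the rating scale, and return the
    mean squared error against the 'score' label."""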
    usr_combined_features = get_usr_combined_features()
    mov_combined_features = get_mov_combined_features()

    # Cosine similarity between the user and movie feature vectors gives the predicted preference.
    inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features)
    scale_infer = layers.scale(x=inference, scale=5.0)

    label = layers.data(name='score', shape=[1], dtype='float32')

    square_cost = layers.square_error_cost(input=scale_infer, label=label)

    avg_cost = layers.mean(x=square_cost)

    return avg_cost


def main():
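    """Build the program, then train on the MovieLens dataset with plain SGD;
    exit with status 0 as soon as a batch's average cost drops below 6.0,
    otherwise run for up to PASS_NUM passes."""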
    cost = model()
    sgd_optimizer = SGDOptimizer(learning_rate=0.2)
    opts = sgd_optimizer.minimize(cost)

    if USE_GPU:
        place = core.CUDAPlace(0)
    else:
        place = core.CPUPlace()

    exe = Executor(place)
    exe.run(framework.default_startup_program())

    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.movielens.train(), buf_size=8192),
        batch_size=BATCH_SIZE)

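    # Map each feed name to its column index in a MovieLens sample tuple.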
    feeding = {
        'user_id': 0,
        'gender_id': 1,
        'age_id': 2,
        'job_id': 3,
        'movie_id': 4,
        'category_id': 5,
        'movie_title': 6,
        'score': 7
    }

    def func_feed(feeding, data):
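        """Convert one minibatch into LoDTensors keyed by feed name.

        Dense fields become [N, 1] tensors; the two sequence fields
        (category_id, movie_title) are concatenated and given LoD offsets.
        """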
        feed_tensors = {}
        for (key, idx) in feeding.iteritems():
            tensor = core.LoDTensor()
            if key != "category_id" and key != "movie_title":
                if key == "score":
                    numpy_data = np.array(map(lambda x: x[idx], data)).astype(
                        "float32")
                else:
                    numpy_data = np.array(map(lambda x: x[idx], data)).astype(
                        "int64")
            else:
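                # category_id and movie_title are variable-length sequences:
                # build LoD offsets so they can be fed as a single LoDTensor.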
                numpy_data = map(lambda x: np.array(x[idx]).astype("int64"),
                                 data)
                lod_info = [len(item) for item in numpy_data]
                offset = 0
                lod = [offset]
                for item in lod_info:
                    offset += item
                    lod.append(offset)
                numpy_data = np.concatenate(numpy_data, axis=0)
                tensor.set_lod([lod])

            numpy_data = numpy_data.reshape([numpy_data.shape[0], 1])
            tensor.set(numpy_data, place)
            feed_tensors[key] = tensor
        return feed_tensors

    PASS_NUM = 100
    for pass_id in range(PASS_NUM):
        for data in train_reader():
            outs = exe.run(framework.default_main_program(),
                           feed=func_feed(feeding, data),
                           fetch_list=[cost])
            out = np.array(outs[0])
            if out[0] < 6.0:
                # If the average cost drops below 6.0, we consider the test passed.
                exit(0)


main()