test_recommender_system.py 12.7 KB
Newer Older
1
#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
D
dzhwinter 已提交
2
#
D
dzhwinter 已提交
3 4 5
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
D
dzhwinter 已提交
6
#
D
dzhwinter 已提交
7
#     http://www.apache.org/licenses/LICENSE-2.0
D
dzhwinter 已提交
8
#
D
dzhwinter 已提交
9 10 11 12 13 14
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

15 16
from __future__ import print_function

17 18
import math
import sys
武毅 已提交
19
import os
Q
Qiao Longfei 已提交
20
import numpy as np
21
import paddle
22 23 24 25 26 27
import paddle.fluid as fluid
import paddle.fluid.framework as framework
import paddle.fluid.layers as layers
import paddle.fluid.nets as nets
from paddle.fluid.executor import Executor
from paddle.fluid.optimizer import SGDOptimizer
28

P
pangyoki 已提交
29 30
paddle.enable_static()

31 32
IS_SPARSE = True
USE_GPU = False
33 34 35 36 37 38 39 40 41
BATCH_SIZE = 256


def get_usr_combined_features():
    # FIXME(dzh) : old API integer_value(10) may has range check.
    # currently we don't have user configurated check.

    USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1

F
fengjiayi 已提交
42
    uid = layers.data(name='user_id', shape=[1], dtype='int64')
43

44 45 46 47 48
    usr_emb = layers.embedding(input=uid,
                               dtype='float32',
                               size=[USR_DICT_SIZE, 32],
                               param_attr='user_table',
                               is_sparse=IS_SPARSE)
49

Q
Qiao Longfei 已提交
50
    usr_fc = layers.fc(input=usr_emb, size=32)
51 52 53

    USR_GENDER_DICT_SIZE = 2

F
fengjiayi 已提交
54
    usr_gender_id = layers.data(name='gender_id', shape=[1], dtype='int64')
55

56 57 58 59
    usr_gender_emb = layers.embedding(input=usr_gender_id,
                                      size=[USR_GENDER_DICT_SIZE, 16],
                                      param_attr='gender_table',
                                      is_sparse=IS_SPARSE)
60

Q
Qiao Longfei 已提交
61
    usr_gender_fc = layers.fc(input=usr_gender_emb, size=16)
62 63

    USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table)
F
fengjiayi 已提交
64
    usr_age_id = layers.data(name='age_id', shape=[1], dtype="int64")
65

66 67 68 69
    usr_age_emb = layers.embedding(input=usr_age_id,
                                   size=[USR_AGE_DICT_SIZE, 16],
                                   is_sparse=IS_SPARSE,
                                   param_attr='age_table')
70

Q
Qiao Longfei 已提交
71
    usr_age_fc = layers.fc(input=usr_age_emb, size=16)
72 73

    USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1
F
fengjiayi 已提交
74
    usr_job_id = layers.data(name='job_id', shape=[1], dtype="int64")
75

76 77 78 79
    usr_job_emb = layers.embedding(input=usr_job_id,
                                   size=[USR_JOB_DICT_SIZE, 16],
                                   param_attr='job_table',
                                   is_sparse=IS_SPARSE)
80

Q
Qiao Longfei 已提交
81
    usr_job_fc = layers.fc(input=usr_job_emb, size=16)
82 83

    concat_embed = layers.concat(
Q
Qiao Longfei 已提交
84
        input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], axis=1)
85

Q
Qiao Longfei 已提交
86
    usr_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
87 88 89 90 91 92 93 94

    return usr_combined_features


def get_mov_combined_features():

    MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1

F
fengjiayi 已提交
95
    mov_id = layers.data(name='movie_id', shape=[1], dtype='int64')
96

97 98 99 100 101
    mov_emb = layers.embedding(input=mov_id,
                               dtype='float32',
                               size=[MOV_DICT_SIZE, 32],
                               param_attr='movie_table',
                               is_sparse=IS_SPARSE)
102

Q
Qiao Longfei 已提交
103
    mov_fc = layers.fc(input=mov_emb, size=32)
104 105 106

    CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())

107 108 109 110
    category_id = layers.data(name='category_id',
                              shape=[1],
                              dtype='int64',
                              lod_level=1)
111

112 113 114
    mov_categories_emb = layers.embedding(input=category_id,
                                          size=[CATEGORY_DICT_SIZE, 32],
                                          is_sparse=IS_SPARSE)
115

116 117
    mov_categories_hidden = layers.sequence_pool(input=mov_categories_emb,
                                                 pool_type="sum")
118 119 120

    MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())

121 122 123 124
    mov_title_id = layers.data(name='movie_title',
                               shape=[1],
                               dtype='int64',
                               lod_level=1)
125

126 127 128
    mov_title_emb = layers.embedding(input=mov_title_id,
                                     size=[MOV_TITLE_DICT_SIZE, 32],
                                     is_sparse=IS_SPARSE)
129

130 131 132 133 134
    mov_title_conv = nets.sequence_conv_pool(input=mov_title_emb,
                                             num_filters=32,
                                             filter_size=3,
                                             act="tanh",
                                             pool_type="sum")
135 136

    concat_embed = layers.concat(
Q
Qiao Longfei 已提交
137
        input=[mov_fc, mov_categories_hidden, mov_title_conv], axis=1)
138 139

    # FIXME(dzh) : need tanh operator
Q
Qiao Longfei 已提交
140
    mov_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
141 142 143 144 145 146 147 148 149

    return mov_combined_features


def model():
    usr_combined_features = get_usr_combined_features()
    mov_combined_features = get_mov_combined_features()

    # need cos sim
Q
Qiao Longfei 已提交
150
    inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features)
T
typhoonzero 已提交
151
    scale_infer = layers.scale(x=inference, scale=5.0)
152

F
fengjiayi 已提交
153
    label = layers.data(name='score', shape=[1], dtype='float32')
T
typhoonzero 已提交
154
    square_cost = layers.square_error_cost(input=scale_infer, label=label)
Y
Yu Yang 已提交
155
    avg_cost = layers.mean(square_cost)
156

157 158
    return scale_infer, avg_cost

159

武毅 已提交
160
def train(use_cuda, save_dirname, is_local=True):
161 162 163
    scale_infer, avg_cost = model()

    # test program
164
    test_program = fluid.default_main_program().clone(for_test=True)
165

Q
Qiao Longfei 已提交
166
    sgd_optimizer = SGDOptimizer(learning_rate=0.2)
W
Wu Yi 已提交
167
    sgd_optimizer.minimize(avg_cost)
168

169
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
170 171 172

    exe = Executor(place)

173 174 175 176 177
    train_reader = paddle.batch(paddle.reader.shuffle(
        paddle.dataset.movielens.train(), buf_size=8192),
                                batch_size=BATCH_SIZE)
    test_reader = paddle.batch(paddle.dataset.movielens.test(),
                               batch_size=BATCH_SIZE)
178

179 180 181 182
    feed_order = [
        'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', 'category_id',
        'movie_title', 'score'
    ]
183

武毅 已提交
184 185 186
    def train_loop(main_program):
        exe.run(framework.default_startup_program())

187 188 189 190 191
        feed_list = [
            main_program.global_block().var(var_name) for var_name in feed_order
        ]
        feeder = fluid.DataFeeder(feed_list, place)

武毅 已提交
192 193 194 195 196
        PASS_NUM = 100
        for pass_id in range(PASS_NUM):
            for batch_id, data in enumerate(train_reader()):
                # train a mini-batch
                outs = exe.run(program=main_program,
197
                               feed=feeder.feed(data),
武毅 已提交
198 199 200 201 202
                               fetch_list=[avg_cost])
                out = np.array(outs[0])
                if (batch_id + 1) % 10 == 0:
                    avg_cost_set = []
                    for test_data in test_reader():
203 204 205
                        avg_cost_np = exe.run(program=test_program,
                                              feed=feeder.feed(test_data),
                                              fetch_list=[avg_cost])
武毅 已提交
206 207 208 209 210 211 212 213
                        avg_cost_set.append(avg_cost_np[0])
                        break  # test only 1 segment for speeding up CI

                    # get test avg_cost
                    test_avg_cost = np.array(avg_cost_set).mean()
                    if test_avg_cost < 6.0:
                        # if avg_cost less than 6.0, we think our code is good.
                        if save_dirname is not None:
214 215 216 217 218
                            fluid.io.save_inference_model(
                                save_dirname, [
                                    "user_id", "gender_id", "age_id", "job_id",
                                    "movie_id", "category_id", "movie_title"
                                ], [scale_infer], exe)
武毅 已提交
219 220 221 222 223 224 225 226
                        return

                if math.isnan(float(out[0])):
                    sys.exit("got NaN loss, training failed.")

    if is_local:
        train_loop(fluid.default_main_program())
    else:
G
gongweibao 已提交
227 228
        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
武毅 已提交
229 230 231 232
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
G
gongweibao 已提交
233
        trainers = int(os.getenv("PADDLE_TRAINERS"))
武毅 已提交
234
        current_endpoint = os.getenv("POD_IP") + ":" + port
G
gongweibao 已提交
235 236
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
武毅 已提交
237
        t = fluid.DistributeTranspiler()
Y
Yancey1989 已提交
238
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
武毅 已提交
239 240 241 242 243 244 245 246
        if training_role == "PSERVER":
            pserver_prog = t.get_pserver_program(current_endpoint)
            pserver_startup = t.get_startup_program(current_endpoint,
                                                    pserver_prog)
            exe.run(pserver_startup)
            exe.run(pserver_prog)
        elif training_role == "TRAINER":
            train_loop(t.get_trainer_program())
247 248


249 250 251 252 253 254 255
def infer(use_cuda, save_dirname=None):
    if save_dirname is None:
        return

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)

256 257 258
    inference_scope = fluid.core.Scope()
    with fluid.scope_guard(inference_scope):
        # Use fluid.io.load_inference_model to obtain the inference program desc,
T
tianshuo78520a 已提交
259
        # the feed_target_names (the names of variables that will be fed
260 261 262 263 264 265 266
        # data using feed operators), and the fetch_targets (variables that
        # we want to obtain data from using fetch operators).
        [inference_program, feed_target_names,
         fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)

        # Use the first data from paddle.dataset.movielens.test() as input
        assert feed_target_names[0] == "user_id"
267 268 269
        # Use create_lod_tensor(data, recursive_sequence_lengths, place) API
        # to generate LoD Tensor where `data` is a list of sequences of index
        # numbers, `recursive_sequence_lengths` is the length-based level of detail
270
        # (lod) info associated with `data`.
271 272
        # For example, data = [[10, 2, 3], [2, 3]] means that it contains
        # two sequences of indexes, of length 3 and 2, respectively.
273 274 275
        # Correspondingly, recursive_sequence_lengths = [[3, 2]] contains one
        # level of detail info, indicating that `data` consists of two sequences
        # of length 3 and 2, respectively.
P
peizhilin 已提交
276
        user_id = fluid.create_lod_tensor([[np.int64(1)]], [[1]], place)
277 278

        assert feed_target_names[1] == "gender_id"
P
peizhilin 已提交
279
        gender_id = fluid.create_lod_tensor([[np.int64(1)]], [[1]], place)
280 281

        assert feed_target_names[2] == "age_id"
P
peizhilin 已提交
282
        age_id = fluid.create_lod_tensor([[np.int64(0)]], [[1]], place)
283 284

        assert feed_target_names[3] == "job_id"
P
peizhilin 已提交
285
        job_id = fluid.create_lod_tensor([[np.int64(10)]], [[1]], place)
286 287

        assert feed_target_names[4] == "movie_id"
P
peizhilin 已提交
288
        movie_id = fluid.create_lod_tensor([[np.int64(783)]], [[1]], place)
289 290

        assert feed_target_names[5] == "category_id"
P
peizhilin 已提交
291
        category_id = fluid.create_lod_tensor(
292
            [np.array([10, 8, 9], dtype='int64')], [[3]], place)
293 294

        assert feed_target_names[6] == "movie_title"
P
peizhilin 已提交
295
        movie_title = fluid.create_lod_tensor(
296
            [np.array([1069, 4140, 2923, 710, 988], dtype='int64')], [[5]],
P
peizhilin 已提交
297
            place)
298 299 300 301 302 303 304 305 306 307 308 309 310 311 312

        # Construct feed as a dictionary of {feed_target_name: feed_target_data}
        # and results will contain a list of data corresponding to fetch_targets.
        results = exe.run(inference_program,
                          feed={
                              feed_target_names[0]: user_id,
                              feed_target_names[1]: gender_id,
                              feed_target_names[2]: age_id,
                              feed_target_names[3]: job_id,
                              feed_target_names[4]: movie_id,
                              feed_target_names[5]: category_id,
                              feed_target_names[6]: movie_title
                          },
                          fetch_list=fetch_targets,
                          return_numpy=False)
313
        print("inferred score: ", np.array(results[0]))
314 315 316 317 318 319 320 321 322 323 324 325 326 327 328


def main(use_cuda):
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return

    # Directory for saving the inference model
    save_dirname = "recommender_system.inference.model"

    train(use_cuda, save_dirname)
    infer(use_cuda, save_dirname)


if __name__ == '__main__':
    main(USE_GPU)