train.py 12.3 KB
Newer Older
N
Nicky 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

15
from __future__ import print_function
N
Nicky 已提交
16 17
import math
import sys
u010070587's avatar
u010070587 已提交
18
import argparse
N
Nicky 已提交
19 20 21 22 23 24 25 26
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import paddle.fluid.nets as nets

# Passed as `is_sparse` to every fluid.embedding call in this file.
IS_SPARSE = True
# Passed as `batch_size` to fluid.io.batch for both the train and test readers.
BATCH_SIZE = 256
u010070587's avatar
u010070587 已提交
27 28 29 30 31 32 33 34 35 36 37 38 39 40


def parse_args():
    """Parse the command-line options of the recommender-system example.

    Options:
        --enable_ce:  switch, off unless given.
        --use_gpu:    integer, defaults to 0.
        --num_epochs: integer, defaults to 1.

    Returns:
        The argparse namespace holding ``enable_ce``, ``use_gpu`` and
        ``num_epochs``.
    """
    cli = argparse.ArgumentParser("recommender_system")
    cli.add_argument(
        '--enable_ce',
        action='store_true',
        help="If set, run the task with continuous evaluation logs.")

    # The integer options only differ in name, default and help text.
    integer_options = (
        ('--use_gpu', 0, "Whether to use GPU or not."),
        ('--num_epochs', 1, "number of epochs."),
    )
    for flag, fallback, description in integer_options:
        cli.add_argument(flag, type=int, default=fallback, help=description)

    return cli.parse_args()
Y
Yu Yang 已提交
41

H
Helin Wang 已提交
42

Q
qijun 已提交
43
def get_usr_combined_features():
    """Build the user side of the network.

    Four int64 inputs ('user_id', 'gender_id', 'age_id', 'job_id') are each
    looked up in their own embedding table and passed through an fc layer of
    the same width as the embedding. The four results are concatenated along
    axis 1 and fed to a 200-unit fc layer with tanh activation.

    Returns:
        The output variable of the final 200-unit fc layer.
    """

    def _embed_then_fc(feed_name, table_name, vocab_size, width):
        # One input -> embedding -> fc stanza; layers are created in this
        # order so the overall construction order matches a hand-unrolled
        # version of the four features.
        ids = fluid.data(name=feed_name, shape=[-1], dtype='int64')
        looked_up = fluid.embedding(
            input=ids,
            dtype='float32',
            size=[vocab_size, width],
            param_attr=table_name,
            is_sparse=IS_SPARSE)
        return layers.fc(input=looked_up, size=width)

    movielens = paddle.dataset.movielens

    user_vec = _embed_then_fc(
        'user_id', 'user_table', movielens.max_user_id() + 1, 32)
    gender_vec = _embed_then_fc('gender_id', 'gender_table', 2, 16)
    age_vec = _embed_then_fc(
        'age_id', 'age_table', len(movielens.age_table), 16)
    job_vec = _embed_then_fc(
        'job_id', 'job_table', movielens.max_job_id() + 1, 16)

    merged = layers.concat(
        input=[user_vec, gender_vec, age_vec, job_vec], axis=1)

    return layers.fc(input=merged, size=200, act="tanh")
Y
Yu Yang 已提交
98

Q
qijun 已提交
99

Q
qijun 已提交
100
def get_mov_combined_features():
    """Build the movie side of the network.

    Three int64 inputs are combined:
      * 'movie_id'    -> embedding ('movie_table') -> 32-unit fc
      * 'category_id' -> embedding -> sequence sum-pool (LoD level 1 input)
      * 'movie_title' -> embedding -> sequence_conv_pool with 32 filters of
        size 3, tanh activation and sum pooling (LoD level 1 input)
    The three results are concatenated along axis 1 and fed to a 200-unit
    fc layer with tanh activation.

    Returns:
        The output variable of the final 200-unit fc layer.
    """
    movielens = paddle.dataset.movielens

    # --- movie id -----------------------------------------------------------
    n_movies = movielens.max_movie_id() + 1
    movie_ids = fluid.data(name='movie_id', shape=[-1], dtype='int64')
    movie_vec = layers.fc(
        input=fluid.embedding(
            input=movie_ids,
            dtype='float32',
            size=[n_movies, 32],
            param_attr='movie_table',
            is_sparse=IS_SPARSE),
        size=32)

    # --- categories ---------------------------------------------------------
    n_categories = len(movielens.movie_categories())
    category_ids = fluid.data(
        name='category_id', shape=[-1], dtype='int64', lod_level=1)
    category_vec = layers.sequence_pool(
        input=fluid.embedding(
            input=category_ids,
            size=[n_categories, 32],
            is_sparse=IS_SPARSE),
        pool_type="sum")

    # --- title --------------------------------------------------------------
    n_title_words = len(movielens.get_movie_title_dict())
    title_ids = fluid.data(
        name='movie_title', shape=[-1], dtype='int64', lod_level=1)
    title_vec = nets.sequence_conv_pool(
        input=fluid.embedding(
            input=title_ids,
            size=[n_title_words, 32],
            is_sparse=IS_SPARSE),
        num_filters=32,
        filter_size=3,
        act="tanh",
        pool_type="sum")

    merged = layers.concat(input=[movie_vec, category_vec, title_vec], axis=1)

    return layers.fc(input=merged, size=200, act="tanh")
Q
qijun 已提交
147

Y
Yu Yang 已提交
148

N
Nicky 已提交
149
def inference_program():
    """Assemble the rating predictor and its training loss.

    The prediction is the cosine similarity between the combined user and
    movie features, scaled by 5.0. The loss is the mean of the square error
    between that prediction and the float32 'score' input.

    Returns:
        A ``(scale_infer, avg_cost)`` tuple: the scaled prediction variable
        and the mean square-error cost variable.
    """
    user_features = get_usr_combined_features()
    movie_features = get_mov_combined_features()

    similarity = layers.cos_sim(X=user_features, Y=movie_features)
    predicted_score = layers.scale(x=similarity, scale=5.0)

    target_score = fluid.data(name='score', shape=[-1, 1], dtype='float32')
    per_sample_cost = layers.square_error_cost(
        input=predicted_score, label=target_score)

    return predicted_score, layers.mean(per_sample_cost)
N
Nicky 已提交
161 162 163 164 165 166


def optimizer_func(learning_rate=0.2):
    """Create the SGD optimizer used for training.

    Args:
        learning_rate: Learning rate handed to ``fluid.optimizer.SGD``.
            Defaults to 0.2, the value this script has always used, so
            existing zero-argument callers are unaffected.

    Returns:
        A ``fluid.optimizer.SGD`` instance.
    """
    return fluid.optimizer.SGD(learning_rate=learning_rate)


167
def train(use_cuda, params_dirname):
    """Train the recommender network and export it for inference.

    Besides its parameters this function reads two module-level names that
    are assigned in the ``__main__`` section of this file: ``args`` (for
    ``args.enable_ce``) and ``PASS_NUM`` (number of passes over the data).

    After every training batch the whole test reader is evaluated. Training
    ends when the batch with index 20 has been processed: the inference
    model is saved (unless ``params_dirname`` is None) and the function
    returns.

    Args:
        use_cuda: Truthy to run on ``fluid.CUDAPlace(0)``, otherwise on
            ``fluid.CPUPlace()``.
        params_dirname: Directory given to ``fluid.io.save_inference_model``;
            ``None`` skips saving.
    """
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

    # Continuous-evaluation runs read the training set unshuffled; the test
    # reader is built the same way in both modes.
    train_samples = paddle.dataset.movielens.train()
    if not args.enable_ce:
        train_samples = fluid.io.shuffle(train_samples, buf_size=8192)
    train_reader = fluid.io.batch(train_samples, batch_size=BATCH_SIZE)
    test_reader = fluid.io.batch(
        paddle.dataset.movielens.test(), batch_size=BATCH_SIZE)

    # Inputs of the exported inference model; training additionally feeds
    # the 'score' label.
    input_names = [
        "user_id", "gender_id", "age_id", "job_id", "movie_id", "category_id",
        "movie_title"
    ]
    feed_order = input_names + ['score']

    main_program = fluid.default_main_program()
    startup_program = fluid.default_startup_program()
    if args.enable_ce:
        main_program.random_seed = 90
        startup_program.random_seed = 90

    scale_infer, avg_cost = inference_program()

    # The test clone is taken before the optimizer is attached.
    test_program = main_program.clone(for_test=True)
    optimizer_func().minimize(avg_cost)
    exe = fluid.Executor(place)

    def train_test(program, reader):
        """Return the mean of avg_cost over all batches from reader()."""
        block = program.global_block()
        feeder_test = fluid.DataFeeder(
            feed_list=[block.var(name) for name in feed_order], place=place)
        test_exe = fluid.Executor(place)
        batch_costs = [
            test_exe.run(
                program=program,
                feed=feeder_test.feed(batch),
                fetch_list=[avg_cost])[0] for batch in reader()
        ]
        return sum(batch_costs) / len(batch_costs)

    feeder = fluid.DataFeeder(
        [main_program.global_block().var(name) for name in feed_order], place)
    exe.run(startup_program)

    for pass_id in range(PASS_NUM):
        for batch_id, data in enumerate(train_reader()):
            # Train on one mini-batch.
            outs = exe.run(
                program=main_program,
                feed=feeder.feed(data),
                fetch_list=[avg_cost])
            train_cost = np.array(outs[0])

            # Evaluate on the full test reader.
            test_avg_cost = train_test(test_program, test_reader)

            if batch_id != 20:
                print('EpochID {0}, BatchID {1}, Test Loss {2:0.2}'.format(
                    pass_id + 1, batch_id + 1, float(test_avg_cost)))

                if math.isnan(float(train_cost[0])):
                    sys.exit("got NaN loss, training failed.")
                continue

            # Fixed stopping point (a batch index, not a loss threshold):
            # report, export the model, and finish.
            if args.enable_ce:
                print("kpis\ttest_cost\t%f" % float(test_avg_cost))

            if params_dirname is not None:
                fluid.io.save_inference_model(params_dirname, input_names,
                                              [scale_infer], exe)
            return
N
Nicky 已提交
255 256


257
def infer(use_cuda, params_dirname):
    """Load the saved inference model and predict one user/movie rating.

    A single hard-coded sample is fed to the model and the predicted rating
    is printed next to the rating the script states as the actual one (4).
    NOTE(review): the original comments describe this sample as the first
    record of paddle.dataset.movielens.test() -- not verified here.

    Args:
        use_cuda: Truthy to run on ``fluid.CUDAPlace(0)``, otherwise on
            ``fluid.CPUPlace()``.
        params_dirname: Directory previously written by
            ``fluid.io.save_inference_model``.

    Raises:
        AssertionError: If the loaded model's first seven feed target names
            are not, in order, user_id, gender_id, age_id, job_id, movie_id,
            category_id, movie_title.
    """
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

    # Single source of truth for the sample's ids: the same values are fed
    # to the model and used in the printed messages / title lookup.
    infer_user_id = 1
    infer_movie_id = 783
    infer_movie_name = paddle.dataset.movielens.movie_info()[
        infer_movie_id].title

    exe = fluid.Executor(place)

    inference_scope = fluid.core.Scope()

    with fluid.scope_guard(inference_scope):
        # load_inference_model returns the inference program, the names of
        # the variables to feed, and the variables to fetch.
        [inferencer, feed_target_names,
         fetch_targets] = fluid.io.load_inference_model(params_dirname, exe)

        expected_feed_names = [
            "user_id", "gender_id", "age_id", "job_id", "movie_id",
            "category_id", "movie_title"
        ]
        for position, expected_name in enumerate(expected_feed_names):
            assert feed_target_names[position] == expected_name

        user_id = np.array([infer_user_id]).astype("int64").reshape(-1)
        gender_id = np.array([1]).astype("int64").reshape(-1)
        age_id = np.array([0]).astype("int64").reshape(-1)
        job_id = np.array([10]).astype("int64").reshape(-1)
        movie_id = np.array([infer_movie_id]).astype("int64").reshape(-1)

        # create_lod_tensor(data, recursive_sequence_lengths, place): the
        # second argument gives the length of each sequence in `data`, e.g.
        # [[3]] means `data` is one sequence of three indexes.
        category_id = fluid.create_lod_tensor(
            np.array([10, 8, 9], dtype='int64'), [[3]], place)
        movie_title = fluid.create_lod_tensor(
            np.array([1069, 4140, 2923, 710, 988], dtype='int64'), [[5]], place)

        # Feed is a dict of {feed_target_name: data}; results holds one
        # entry per fetch target.
        results = exe.run(
            inferencer,
            feed={
                feed_target_names[0]: user_id,
                feed_target_names[1]: gender_id,
                feed_target_names[2]: age_id,
                feed_target_names[3]: job_id,
                feed_target_names[4]: movie_id,
                feed_target_names[5]: category_id,
                feed_target_names[6]: movie_title
            },
            fetch_list=fetch_targets,
            return_numpy=False)
        predict_rating = np.array(results[0])
        print("Predict Rating of user id " + str(infer_user_id) +
              " on movie \"" + infer_movie_name + "\" is " +
              str(predict_rating[0][0]))
        print("Actual Rating of user id " + str(infer_user_id) +
              " on movie \"" + infer_movie_name + "\" is 4.")
N
Nicky 已提交
337 338 339 340 341 342


def main(use_cuda):
    """Train the model, then run the inference demo on the saved result.

    Does nothing when ``use_cuda`` is truthy but this Paddle build reports
    that it was not compiled with CUDA.

    Args:
        use_cuda: Forwarded to both ``train`` and ``infer``.
    """
    if not use_cuda or fluid.core.is_compiled_with_cuda():
        model_dir = "recommender_system.inference.model"
        train(use_cuda=use_cuda, params_dirname=model_dir)
        infer(use_cuda=use_cuda, params_dirname=model_dir)
Y
Yu Yang 已提交
345 346 347


if __name__ == '__main__':
    # `args` and `PASS_NUM` are module-level names that train() reads, so
    # they must be bound here before main() is called.
    args = parse_args()
    PASS_NUM, use_cuda = args.num_epochs, args.use_gpu
    main(use_cuda)