From 2eba5d90856987b1e32ae9f495e98d9279a44029 Mon Sep 17 00:00:00 2001
From: malin10
Date: Sat, 6 Jun 2020 00:09:08 +0800
Subject: [PATCH] add movie_recommand_demo

---
 core/trainers/single_infer.py                 |  17 +-
 demo/__init__.py                              |  13 ++
 demo/movie_recommand/__init__.py              |  13 ++
 .../data/online_user/users.dat                |   2 +
 demo/movie_recommand/data/process_ml_1m.py    | 146 ++++++++++++++++++
 demo/movie_recommand/data/split.py            |  51 ++++++
 demo/movie_recommand/data/test/log.data.hash  |   0
 demo/movie_recommand/data/train/log.data.hash |   0
 demo/movie_recommand/data_prepare.sh          |  18 +++
 demo/movie_recommand/offline_test.sh          |  12 ++
 demo/movie_recommand/online_rank.sh           |   8 +
 demo/movie_recommand/online_recall.sh         |   9 ++
 demo/movie_recommand/rank/__init__.py         |  13 ++
 demo/movie_recommand/rank/config.yaml         |  93 +++++++++++
 demo/movie_recommand/rank/model.py            | 120 ++++++++++++++
 demo/movie_recommand/recall/__init__.py       |  13 ++
 demo/movie_recommand/recall/config.yaml       |  93 +++++++++++
 demo/movie_recommand/recall/model.py          | 100 ++++++++++++
 demo/movie_recommand/train.sh                 |   8 +
 19 files changed, 728 insertions(+), 1 deletion(-)
 create mode 100755 demo/__init__.py
 create mode 100755 demo/movie_recommand/__init__.py
 create mode 100644 demo/movie_recommand/data/online_user/users.dat
 create mode 100644 demo/movie_recommand/data/process_ml_1m.py
 create mode 100644 demo/movie_recommand/data/split.py
 create mode 100644 demo/movie_recommand/data/test/log.data.hash
 create mode 100644 demo/movie_recommand/data/train/log.data.hash
 create mode 100644 demo/movie_recommand/data_prepare.sh
 create mode 100644 demo/movie_recommand/offline_test.sh
 create mode 100644 demo/movie_recommand/online_rank.sh
 create mode 100644 demo/movie_recommand/online_recall.sh
 create mode 100755 demo/movie_recommand/rank/__init__.py
 create mode 100755 demo/movie_recommand/rank/config.yaml
 create mode 100755 demo/movie_recommand/rank/model.py
 create mode 100755 demo/movie_recommand/recall/__init__.py
 create mode 100755 demo/movie_recommand/recall/config.yaml
 create mode 100755 demo/movie_recommand/recall/model.py
 create mode 100644 demo/movie_recommand/train.sh

diff --git a/core/trainers/single_infer.py b/core/trainers/single_infer.py
index d54e418c..fcc92e2e 100755
--- a/core/trainers/single_infer.py
+++ b/core/trainers/single_infer.py
@@ -20,6 +20,8 @@ from __future__ import print_function
 import time
 import logging
 import os
+import json
+import numpy as np
 
 import paddle.fluid as fluid
 from paddlerec.core.trainers.transpiler_trainer import TranspileTrainer
@@ -263,8 +265,10 @@ class SingleInfer(TranspileTrainer):
             envs.get_global_env("runner." + self._runner_name +
                                 ".print_interval", 20))
         metrics_format.append("{}: {{}}".format("batch"))
+        metrics_indexes = dict()
         for name, var in metrics.items():
             metrics_varnames.append(var.name)
+            metrics_indexes[var.name] = len(metrics_varnames) - 1
             metrics_format.append("{}: {{}}".format(name))
         metrics_format = ", ".join(metrics_format)
 
@@ -272,19 +276,30 @@ class SingleInfer(TranspileTrainer):
         reader.start()
         batch_id = 0
         scope = self._model[model_name][2]
+
+        infer_results = []
         with fluid.scope_guard(scope):
             try:
                 while True:
                     metrics_rets = self._exe.run(program=program,
-                                                 fetch_list=metrics_varnames)
+                                                 fetch_list=metrics_varnames,
+                                                 return_numpy=False)
                     metrics = [batch_id]
                     metrics.extend(metrics_rets)
+                    batch_infer_result = {}
+                    for k, v in metrics_indexes.items():
+                        batch_infer_result[k] = np.array(metrics_rets[
+                            v]).tolist()
+                    infer_results.append(batch_infer_result)
+
                     if batch_id % fetch_period == 0 and batch_id != 0:
                         print(metrics_format.format(*metrics))
                     batch_id += 1
             except fluid.core.EOFException:
                 reader.reset()
 
+        with open(model_dict['save_path'], 'w') as fout:
+            json.dump(infer_results, fout)
 
     def terminal(self, context):
         context['is_exit'] = True

diff --git a/demo/__init__.py b/demo/__init__.py
new file mode 100755
index 00000000..abf198b9
--- /dev/null
+++ b/demo/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/demo/movie_recommand/__init__.py b/demo/movie_recommand/__init__.py
new file mode 100755
index 00000000..abf198b9
--- /dev/null
+++ b/demo/movie_recommand/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
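Note on the single_infer.py change above: for every batch, the fetched tensors are converted to nested Python lists, keyed by the fetch variable's name (var.name), accumulated over the whole pass, and written as one JSON document to the phase's save_path. Below is a minimal sketch of how a post-processing script might read that file; the file path and the printed summary are illustrative only, not part of this patch:

    import json

    def load_infer_results(path):
        # One dict per batch: {fetch_var_name: nested list of values}.
        with open(path) as f:
            return json.load(f)

    if __name__ == "__main__":
        batches = load_infer_results("recall/infer_result")
        print("batches:", len(batches))
        if batches:
            for name, values in sorted(batches[0].items()):
                print(name, "rows in first batch:", len(values))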
diff --git a/demo/movie_recommand/data/online_user/users.dat b/demo/movie_recommand/data/online_user/users.dat
new file mode 100644
index 00000000..e9649b70
--- /dev/null
+++ b/demo/movie_recommand/data/online_user/users.dat
@@ -0,0 +1,2 @@
+2181::M::25::0
+2073::F::18::4
diff --git a/demo/movie_recommand/data/process_ml_1m.py b/demo/movie_recommand/data/process_ml_1m.py
new file mode 100644
index 00000000..7125f625
--- /dev/null
+++ b/demo/movie_recommand/data/process_ml_1m.py
@@ -0,0 +1,146 @@
+#coding=utf8
+import sys
+reload(sys)
+sys.setdefaultencoding('utf-8')
+import random
+import json
+user_fea = ["userid", "gender", "age", "occupation"]
+movie_fea = ["movieid", "title", "genres"]
+rating_fea = ["userid", "movieid", "rating", "time"]
+dict_size = 60000000
+hash_dict = dict()
+
+data_path = "ml-1m"
+test_user_path = "online_user"
+
+
+def process(path):
+    user_dict = parse_data(data_path + "/users.dat", user_fea)
+    movie_dict = parse_movie_data(data_path + "/movies.dat", movie_fea)
+
+    for line in open(path):
+        line = line.strip()
+        arr = line.split("::")
+        userid = arr[0]
+        movieid = arr[1]
+        out_str = "time:%s\t%s\t%s\tlabel:%s" % (arr[3], user_dict[userid],
+                                                 movie_dict[movieid], arr[2])
+        log_id = hash(out_str) % 1000000000
+        print "%s\t%s" % (log_id, out_str)
+
+
+def parse_data(file_name, feas):
+    dict = {}
+    for line in open(file_name):
+        line = line.strip()
+        arr = line.split("::")
+        out_str = ""
+        for i in range(0, len(feas)):
+            out_str += "%s:%s\t" % (feas[i], arr[i])
+
+        dict[arr[0]] = out_str.strip()
+    return dict
+
+
+def parse_movie_data(file_name, feas):
+    dict = {}
+    for line in open(file_name):
+        line = line.strip()
+        arr = line.split("::")
+        title_str = ""
+        genres_str = ""
+
+        for term in arr[1].split(" "):
+            term = term.strip()
+            if term != "":
+                title_str += "%s " % (term)
+        for term in arr[2].split("|"):
+            term = term.strip()
+            if term != "":
+                genres_str += "%s " % (term)
+        out_str = "movieid:%s\ttitle:%s\tgenres:%s" % (
+            arr[0], title_str.strip(), genres_str.strip())
+        dict[arr[0]] = out_str.strip()
+    return dict
+
+
+def to_hash(in_str):
+    feas = in_str.split(":")[0]
+    arr = in_str.split(":")[1]
+    out_str = "%s:%s" % (feas, (arr + arr[::-1] + arr[::-2] + arr[::-3]))
+    hash_id = hash(out_str) % dict_size
+    if hash_id in hash_dict and hash_dict[hash_id] != out_str:
+        print(hash_id, out_str, hash(out_str))
+        print("conflict")
+        exit(-1)
+
+    return "%s:%s" % (feas, hash_id)
+
+
+def to_hash_list(in_str):
+    arr = in_str.split(":")
+    tmp_arr = arr[1].split(" ")
+    out_str = ""
+    for item in tmp_arr:
+        item = item.strip()
+        if item != "":
+            key = "%s:%s" % (arr[0], item)
+            out_str += "%s " % (to_hash(key))
+    return out_str.strip()
+
+
+def get_hash(path):
+    #0-34831 1-time:974673057 2-userid:2021 3-gender:M 4-age:25 5-occupation:0 6-movieid:1345 7-title:Carrie (1976) 8-genres:Horror 9-label:2
+    for line in open(path):
+        arr = line.strip().split("\t")
+        out_str = "logid:%s %s %s %s %s %s %s %s %s %s" % \
+            (arr[0], arr[1], to_hash(arr[2]), to_hash(arr[3]), to_hash(arr[4]), to_hash(arr[5]), \
+            to_hash(arr[6]), to_hash_list(arr[7]), to_hash_list(arr[8]), arr[9])
+        print out_str
+
+
+def generate_online_user():
+    movie_dict = parse_movie_data(data_path + "/movies.dat", movie_fea)
+
+    with open(test_user_path + "/movies.dat", 'w') as f:
+        for line in open(test_user_path + "/users.dat"):
+            line = line.strip()
+            arr = line.split("::")
+            userid = arr[0]
+            for item in movie_dict:
+                f.write(userid + "::" + item + "::1")
+                f.write("\n")
+
+
+def generate_online_data(path):
+    user_dict = parse_data(data_path + "/users.dat", user_fea)
+    movie_dict = parse_movie_data(data_path + "/movies.dat", movie_fea)
+
+    for line in open(path):
+        line = line.strip()
+        arr = line.split("::")
+        userid = arr[0]
+        movieid = arr[1]
+        label = arr[2]
+        out_str = "time:%s\t%s\t%s\tlabel:%s" % ("1", user_dict[userid],
+                                                 movie_dict[movieid], label)
+        log_id = hash(out_str) % 1000000000
+        res = "%s\t%s" % (log_id, out_str)
+        arr = res.strip().split("\t")
+        out_str = "logid:%s %s %s %s %s %s %s %s %s %s" % \
+            (arr[0], arr[1], to_hash(arr[2]), to_hash(arr[3]), to_hash(arr[4]), to_hash(arr[5]), \
+            to_hash(arr[6]), to_hash_list(arr[7]), to_hash_list(arr[8]), arr[9])
+        print(out_str)
+
+
+if __name__ == "__main__":
+    random.seed(1111111)
+    if sys.argv[1] == "process_raw":
+        process(sys.argv[2])
+    elif sys.argv[1] == "hash":
+        get_hash(sys.argv[2])
+    elif sys.argv[1] == "data_recall":
+        generate_online_user()
+        generate_online_data(test_user_path + "/movies.dat")
+    elif sys.argv[1] == "data_rank":
+        generate_online_data(test_user_path + "/movies.dat")
diff --git a/demo/movie_recommand/data/split.py b/demo/movie_recommand/data/split.py
new file mode 100644
index 00000000..9c0a7fd0
--- /dev/null
+++ b/demo/movie_recommand/data/split.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+
+train = dict()
+test = dict()
+data_path = "ml-1m"
+
+for line in open(data_path + "/ratings.dat"):
+    fea = line.rstrip().split("::")
+    if fea[0] not in train:
+        train[fea[0]] = [line]
+    elif fea[0] not in test:
+        test[fea[0]] = dict()
+        test[fea[0]]['time'] = int(fea[3])
+        test[fea[0]]['content'] = line
+    else:
+        time = int(fea[3])
+        if time <= test[fea[0]]['time']:
+            train[fea[0]].append(line)
+        else:
+            train[fea[0]].append(test[fea[0]]['content'])
+            test[fea[0]]['time'] = time
+            test[fea[0]]['content'] = line
+
+train_data = []
+for key in train:
+    for line in train[key]:
+        train_data.append(line)
+
+random.shuffle(train_data)
+
+with open(data_path + "/train.dat", 'w') as f:
+    for line in train_data:
+        f.write(line)
+
+with open(data_path + "/test.dat", 'w') as f:
+    for key in test:
+        f.write(test[key]['content'])
diff --git a/demo/movie_recommand/data/test/log.data.hash b/demo/movie_recommand/data/test/log.data.hash
new file mode 100644
index 00000000..e69de29b
diff --git a/demo/movie_recommand/data/train/log.data.hash b/demo/movie_recommand/data/train/log.data.hash
new file mode 100644
index 00000000..e69de29b
diff --git a/demo/movie_recommand/data_prepare.sh b/demo/movie_recommand/data_prepare.sh
new file mode 100644
index 00000000..b5dd3e07
--- /dev/null
+++ b/demo/movie_recommand/data_prepare.sh
@@ -0,0 +1,18 @@
+cd data
+
+wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
+unzip ml-1m.zip
+
+python split.py
+
+mkdir train/
+mkdir test/
+
+python process_ml_1m.py process_raw ./ml-1m/train.dat | sort -t $'\t' -k 9 -n > log.data.train
+python process_ml_1m.py process_raw ./ml-1m/test.dat | sort -t $'\t' -k 9 -n > log.data.test
+python process_ml_1m.py hash log.data.train > ./train/log.data.hash
+python process_ml_1m.py hash log.data.test > ./test/log.data.hash
+
+rm log.data.train
+rm log.data.test
+cd ../
diff --git a/demo/movie_recommand/offline_test.sh b/demo/movie_recommand/offline_test.sh
new file mode 100644
index 00000000..f4b83c51
--- /dev/null
+++ b/demo/movie_recommand/offline_test.sh
@@ -0,0 +1,12 @@
+## modify config.yaml to infer mode at first
+
+cd recall
+python -m paddlerec.run -m ./config.yaml
+cd ../rank
+python -m paddlerec.run -m ./config.yaml
+cd ..
+ +echo "recall offline test result:" +python parse.py recall_offline recall/infer_result +echo "rank offline test result:" +python parse.py recall_offline rank/infer_result diff --git a/demo/movie_recommand/online_rank.sh b/demo/movie_recommand/online_rank.sh new file mode 100644 index 00000000..f2f5f167 --- /dev/null +++ b/demo/movie_recommand/online_rank.sh @@ -0,0 +1,8 @@ +cd data +python process_ml_1m.py data_rank > online_user/test/data.txt + +## modify recall/config.yaml to online_infer mode +cd ../rank +python -m paddlerec.run -m ./config.yaml +cd ../ +python parse.py rank_online rank/infer_result diff --git a/demo/movie_recommand/online_recall.sh b/demo/movie_recommand/online_recall.sh new file mode 100644 index 00000000..23fa7912 --- /dev/null +++ b/demo/movie_recommand/online_recall.sh @@ -0,0 +1,9 @@ +cd data +mkdir online_user/test +python process_ml_1m.py data_recall > online_user/test/data.txt + +## modify recall/config.yaml to online_infer mode +cd ../recall +python -m paddlerec.run -m ./config.yaml +cd ../ +python parse.py recall_online recall/infer_result diff --git a/demo/movie_recommand/rank/__init__.py b/demo/movie_recommand/rank/__init__.py new file mode 100755 index 00000000..abf198b9 --- /dev/null +++ b/demo/movie_recommand/rank/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/demo/movie_recommand/rank/config.yaml b/demo/movie_recommand/rank/config.yaml new file mode 100755 index 00000000..4bf1b325 --- /dev/null +++ b/demo/movie_recommand/rank/config.yaml @@ -0,0 +1,93 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+workspace: "demo/movie_recommand"
+
+# list of dataset
+dataset:
+- name: dataset_train # name of dataset to distinguish different datasets
+  batch_size: 128
+  type: QueueDataset
+  data_path: "{workspace}/data/train"
+  sparse_slots: "logid time userid gender age occupation movieid title genres label"
+  dense_slots: ""
+- name: dataset_infer # name
+  batch_size: 128
+  type: DataLoader
+  data_path: "{workspace}/data/test"
+  sparse_slots: "logid time userid gender age occupation movieid title genres label"
+  dense_slots: ""
+- name: dataset_online_infer # name
+  batch_size: 10
+  type: DataLoader
+  data_path: "{workspace}/data/online_user/test"
+  sparse_slots: "logid time userid gender age occupation movieid title genres label"
+  dense_slots: ""
+
+# hyper parameters of user-defined network
+hyper_parameters:
+  # optimizer config
+  optimizer:
+    class: Adam
+    learning_rate: 0.001
+    strategy: async
+  # user-defined <key, value> pairs
+  sparse_feature_number: 60000000
+  sparse_feature_dim: 9
+  dense_input_dim: 13
+  fc_sizes: [512, 256, 128, 32]
+
+# train
+mode: runner_train
+
+## online or offline infer
+#mode: runner_infer
+runner:
+- name: runner_train
+  class: single_train
+  save_checkpoint_interval: 1 # save model interval of epochs
+  save_inference_interval: 1 # save inference
+  save_checkpoint_path: "increment" # save checkpoint path
+  save_inference_path: "inference" # save inference path
+  epochs: 10
+  device: cpu
+
+- name: runner_infer
+  epochs: 1
+  class: single_infer
+  print_interval: 10000
+  init_model_path: "increment/9" # load model path
+
+#train
+phase:
+- name: phase1
+  model: "{workspace}/model.py" # user-defined model
+  dataset_name: dataset_train # select dataset by name
+  thread_num: 12
+
+##offline infer
+#phase:
+#- name: phase1
+#  model: "{workspace}/model.py" # user-defined model
+#  dataset_name: dataset_infer # select dataset by name
+#  save_path: "./infer_result"
+#  thread_num: 1
+
+##online infer
+#phase:
+#- name: phase1
+#  model: "{workspace}/model.py" # user-defined model
+#  dataset_name: dataset_online_infer # select dataset by name
+#  save_path: "./infer_result"
+#  thread_num: 1
diff --git a/demo/movie_recommand/rank/model.py b/demo/movie_recommand/rank/model.py
new file mode 100755
index 00000000..2393e354
--- /dev/null
+++ b/demo/movie_recommand/rank/model.py
@@ -0,0 +1,120 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+
+import paddle.fluid as fluid
+
+from paddlerec.core.utils import envs
+from paddlerec.core.model import Model as ModelBase
+
+
+class Model(ModelBase):
+    def __init__(self, config):
+        ModelBase.__init__(self, config)
+
+    def _init_hyper_parameters(self):
+        self.is_distributed = True if envs.get_trainer(
+        ) == "CtrTrainer" else False
+        self.sparse_feature_number = envs.get_global_env(
+            "hyper_parameters.sparse_feature_number")
+        self.sparse_feature_dim = envs.get_global_env(
+            "hyper_parameters.sparse_feature_dim")
+        self.learning_rate = envs.get_global_env(
+            "hyper_parameters.optimizer.learning_rate")
+        self.hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes")
+
+    def net(self, input, is_infer=False):
+        self.user_sparse_inputs = self._sparse_data_var[2:6]
+        self.mov_sparse_inputs = self._sparse_data_var[6:9]
+
+        self.label_input = self._sparse_data_var[-1]
+
+        def fc(input):
+            fcs = [input]
+            for size in self.hidden_layers:
+                output = fluid.layers.fc(
+                    input=fcs[-1],
+                    size=size,
+                    act='relu',
+                    param_attr=fluid.ParamAttr(
+                        initializer=fluid.initializer.Normal(
+                            scale=1.0 / math.sqrt(fcs[-1].shape[1]))))
+                fcs.append(output)
+            return fcs[-1]
+
+        def embedding_layer(input):
+            emb = fluid.layers.embedding(
+                input=input,
+                is_sparse=True,
+                is_distributed=self.is_distributed,
+                size=[self.sparse_feature_number, self.sparse_feature_dim],
+                param_attr=fluid.ParamAttr(
+                    name="emb", initializer=fluid.initializer.Uniform()), )
+            emb_sum = fluid.layers.sequence_pool(input=emb, pool_type='sum')
+            return emb_sum
+
+        user_sparse_embed_seq = list(
+            map(embedding_layer, self.user_sparse_inputs))
+        mov_sparse_embed_seq = list(
+            map(embedding_layer, self.mov_sparse_inputs))
+        concated_user = fluid.layers.concat(user_sparse_embed_seq, axis=1)
+        concated_mov = fluid.layers.concat(mov_sparse_embed_seq, axis=1)
+
+        usr_combined_features = fc(concated_user)
+        mov_combined_features = fc(concated_mov)
+
+        fc_input = fluid.layers.concat(
+            [usr_combined_features, mov_combined_features], axis=1)
+        sim = fluid.layers.fc(
+            input=fc_input,
+            size=1,
+            act='sigmoid',
+            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
+                scale=1.0 / math.sqrt(fc_input.shape[1]))))
+
+        predict = fluid.layers.scale(sim, scale=5)
+        self.predict = predict
+        #auc, batch_auc, _ = fluid.layers.auc(input=self.predict,
+        #                                     label=self.label_input,
+        #                                     num_thresholds=10000,
+        #                                     slide_steps=20)
+
+        if is_infer:
+            self._infer_results["user_feature"] = usr_combined_features
+            self._infer_results["movie_feature"] = mov_combined_features
+            self._infer_results["uid"] = self._sparse_data_var[2]
+            self._infer_results["movieid"] = self._sparse_data_var[6]
+            self._infer_results["label"] = self._sparse_data_var[-1]
+            self._infer_results["predict"] = self.predict
+            return
+
+        #self._metrics["AUC"] = auc
+        #self._metrics["BATCH_AUC"] = batch_auc
+        #cost = fluid.layers.cross_entropy(
+        #    input=self.predict, label=self.label_input)
+        cost = fluid.layers.square_error_cost(
+            self.predict,
+            fluid.layers.cast(
+                x=self.label_input, dtype='float32'))
+        avg_cost = fluid.layers.reduce_mean(cost)
+        self._cost = avg_cost
+        self._metrics["LOSS"] = avg_cost
+
+    def optimizer(self):
+        optimizer = fluid.optimizer.Adam(self.learning_rate, lazy_mode=True)
+        return optimizer
+
+    def infer_net(self):
+        pass
diff --git a/demo/movie_recommand/recall/__init__.py b/demo/movie_recommand/recall/__init__.py
new file mode 100755
index 00000000..abf198b9
--- /dev/null
+++ b/demo/movie_recommand/recall/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/demo/movie_recommand/recall/config.yaml b/demo/movie_recommand/recall/config.yaml
new file mode 100755
index 00000000..056e2674
--- /dev/null
+++ b/demo/movie_recommand/recall/config.yaml
@@ -0,0 +1,93 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+workspace: "demo/movie_recommand"
+
+# list of dataset
+dataset:
+- name: dataset_train # name of dataset to distinguish different datasets
+  batch_size: 128
+  type: QueueDataset
+  data_path: "{workspace}/data/train"
+  sparse_slots: "logid time userid gender age occupation movieid title genres label"
+  dense_slots: ""
+- name: dataset_infer # name
+  batch_size: 128
+  type: DataLoader
+  data_path: "{workspace}/data/test"
+  sparse_slots: "logid time userid gender age occupation movieid title genres label"
+  dense_slots: ""
+- name: dataset_online_infer # name
+  batch_size: 128
+  type: DataLoader
+  data_path: "{workspace}/data/online_user/test"
+  sparse_slots: "logid time userid gender age occupation movieid title genres label"
+  dense_slots: ""
+
+# hyper parameters of user-defined network
+hyper_parameters:
+  # optimizer config
+  optimizer:
+    class: Adam
+    learning_rate: 0.001
+    strategy: async
+  # user-defined <key, value> pairs
+  sparse_feature_number: 60000000
+  sparse_feature_dim: 9
+  dense_input_dim: 13
+  fc_sizes: [512, 256, 128, 32]
+
+# train
+mode: runner_train
+
+## online or offline infer
+#mode: runner_infer
+runner:
+- name: runner_train
+  class: single_train
+  save_checkpoint_interval: 1 # save model interval of epochs
+  save_inference_interval: 1 # save inference
+  save_checkpoint_path: "increment" # save checkpoint path
+  save_inference_path: "inference" # save inference path
+  epochs: 10
+  device: cpu
+
+- name: runner_infer
+  epochs: 1
+  class: single_infer
+  print_interval: 10000
+  init_model_path: "increment/9" # load model path
+
+#train
+phase:
+- name: phase1
+  model: "{workspace}/model.py" # user-defined model
+  dataset_name: dataset_train # select dataset by name
+  thread_num: 12
+
+##offline infer
+#phase:
+#- name: phase1
+#  model: "{workspace}/model.py" # user-defined model
+#  dataset_name: dataset_infer # select dataset by name
+#  save_path: "./infer_result"
+#  thread_num: 1
+
+##online infer
+#phase:
+#- name: phase1
+#  model: "{workspace}/model.py" # user-defined model
+#  dataset_name: dataset_online_infer # select dataset by name
+#  save_path: "./infer_result"
+#  thread_num: 1
diff --git a/demo/movie_recommand/recall/model.py b/demo/movie_recommand/recall/model.py
new file mode 100755
index 00000000..13773ef5
--- /dev/null
+++ b/demo/movie_recommand/recall/model.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+
+import paddle.fluid as fluid
+
+from paddlerec.core.utils import envs
+from paddlerec.core.model import Model as ModelBase
+
+
+class Model(ModelBase):
+    def __init__(self, config):
+        ModelBase.__init__(self, config)
+
+    def _init_hyper_parameters(self):
+        self.is_distributed = True if envs.get_trainer(
+        ) == "CtrTrainer" else False
+        self.sparse_feature_number = envs.get_global_env(
+            "hyper_parameters.sparse_feature_number")
+        self.sparse_feature_dim = envs.get_global_env(
+            "hyper_parameters.sparse_feature_dim")
+        self.learning_rate = envs.get_global_env(
+            "hyper_parameters.optimizer.learning_rate")
+        self.hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes")
+
+    def net(self, input, is_infer=False):
+        self.user_sparse_inputs = self._sparse_data_var[2:6]
+        self.mov_sparse_inputs = self._sparse_data_var[6:9]
+
+        self.label_input = self._sparse_data_var[-1]
+
+        def fc(input):
+            fcs = [input]
+            for size in self.hidden_layers:
+                output = fluid.layers.fc(
+                    input=fcs[-1],
+                    size=size,
+                    act='relu',
+                    param_attr=fluid.ParamAttr(
+                        initializer=fluid.initializer.Normal(
+                            scale=1.0 / math.sqrt(fcs[-1].shape[1]))))
+                fcs.append(output)
+            return fcs[-1]
+
+        def embedding_layer(input):
+            emb = fluid.layers.embedding(
+                input=input,
+                is_sparse=True,
+                is_distributed=self.is_distributed,
+                size=[self.sparse_feature_number, self.sparse_feature_dim],
+                param_attr=fluid.ParamAttr(
+                    name="emb", initializer=fluid.initializer.Uniform()), )
+            emb_sum = fluid.layers.sequence_pool(input=emb, pool_type='sum')
+            return emb_sum
+
+        user_sparse_embed_seq = list(
+            map(embedding_layer, self.user_sparse_inputs))
+        mov_sparse_embed_seq = list(
+            map(embedding_layer, self.mov_sparse_inputs))
+        concated_user = fluid.layers.concat(user_sparse_embed_seq, axis=1)
+        concated_mov = fluid.layers.concat(mov_sparse_embed_seq, axis=1)
+
+        usr_combined_features = fc(concated_user)
+        mov_combined_features = fc(concated_mov)
+
+        sim = fluid.layers.cos_sim(
+            X=usr_combined_features, Y=mov_combined_features)
+        predict = fluid.layers.scale(sim, scale=5)
+        self.predict = predict
+
+        if is_infer:
+            self._infer_results["uid"] = self._sparse_data_var[2]
+            self._infer_results["movieid"] = self._sparse_data_var[6]
+            self._infer_results["label"] = self._sparse_data_var[-1]
+            self._infer_results["predict"] = self.predict
+            return
+
+        cost = fluid.layers.square_error_cost(
+            self.predict,
+            fluid.layers.cast(
+                x=self.label_input, dtype='float32'))
+        avg_cost = fluid.layers.reduce_mean(cost)
+        self._cost = avg_cost
+        self._metrics["LOSS"] = avg_cost
+
+    def optimizer(self):
+        optimizer = fluid.optimizer.Adam(self.learning_rate, lazy_mode=True)
+        return optimizer
diff --git a/demo/movie_recommand/train.sh b/demo/movie_recommand/train.sh
new file mode 100644
index 00000000..4eb53f45
--- /dev/null
+++ b/demo/movie_recommand/train.sh
@@ -0,0 +1,8 @@
+cd recall
+python -m paddlerec.run -m ./config.yaml
+cd ../rank
+python -m paddlerec.run -m ./config.yaml &> train_log &
+cd ..
+
+echo "recall offline test: "
+python infer_analys
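The offline/online test scripts above call a parse.py that is not included in this patch. Purely as a hypothetical illustration of the kind of post-processing such a script could do with the JSON emitted by the patched single_infer.py, here is a sketch that computes a mean absolute error between a prediction variable and a label variable; the script name, its arguments, and the choice of metric are assumptions, not the demo's actual parser:

    # eval_sketch.py -- hypothetical, NOT the parse.py referenced by the demo scripts.
    # Usage: python eval_sketch.py <infer_result_file> <predict_var_name> <label_var_name>
    import json
    import sys


    def flatten(nested):
        # Flatten arbitrarily nested lists of numbers into a flat list of floats.
        if isinstance(nested, (int, float)):
            return [float(nested)]
        out = []
        for item in nested:
            out.extend(flatten(item))
        return out


    def mean_abs_error(path, predict_key, label_key):
        # The file holds one dict per batch: {fetch_var_name: nested list of values}.
        with open(path) as f:
            batches = json.load(f)
        total, count = 0.0, 0
        for batch in batches:
            for p, l in zip(flatten(batch[predict_key]), flatten(batch[label_key])):
                total += abs(p - l)
                count += 1
        return total / max(count, 1)


    if __name__ == "__main__":
        print("mean abs error:", mean_abs_error(sys.argv[1], sys.argv[2], sys.argv[3]))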