diff --git a/core/trainers/single_infer.py b/core/trainers/single_infer.py index d54e418c2a36d96f94ec39e53fc11c19e43d3f06..fcc92e2e01cf7af49d6d124a7a5773ac9946c989 100755 --- a/core/trainers/single_infer.py +++ b/core/trainers/single_infer.py @@ -20,6 +20,8 @@ from __future__ import print_function import time import logging import os +import json +import numpy as np import paddle.fluid as fluid from paddlerec.core.trainers.transpiler_trainer import TranspileTrainer @@ -263,8 +265,10 @@ class SingleInfer(TranspileTrainer): envs.get_global_env("runner." + self._runner_name + ".print_interval", 20)) metrics_format.append("{}: {{}}".format("batch")) + metrics_indexes = dict() for name, var in metrics.items(): metrics_varnames.append(var.name) + metrics_indexes[var.name] = len(metrics_varnames) - 1 metrics_format.append("{}: {{}}".format(name)) metrics_format = ", ".join(metrics_format) @@ -272,19 +276,30 @@ class SingleInfer(TranspileTrainer): reader.start() batch_id = 0 scope = self._model[model_name][2] + + infer_results = [] with fluid.scope_guard(scope): try: while True: metrics_rets = self._exe.run(program=program, - fetch_list=metrics_varnames) + fetch_list=metrics_varnames, + return_numpy=False) metrics = [batch_id] metrics.extend(metrics_rets) + batch_infer_result = {} + for k, v in metrics_indexes.items(): + batch_infer_result[k] = np.array(metrics_rets[ + v]).tolist() + infer_results.append(batch_infer_result) + if batch_id % fetch_period == 0 and batch_id != 0: print(metrics_format.format(*metrics)) batch_id += 1 except fluid.core.EOFException: reader.reset() + with open(model_dict['save_path'], 'w') as fout: + json.dump(infer_results, fout) def terminal(self, context): context['is_exit'] = True diff --git a/models/demo/__init__.py b/models/demo/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/models/demo/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/models/demo/movie_recommand/__init__.py b/models/demo/movie_recommand/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/models/demo/movie_recommand/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/models/demo/movie_recommand/data/online_user/users.dat b/models/demo/movie_recommand/data/online_user/users.dat new file mode 100644 index 0000000000000000000000000000000000000000..e9649b701a503a28d6154faeaf80357300c6ee35 --- /dev/null +++ b/models/demo/movie_recommand/data/online_user/users.dat @@ -0,0 +1,2 @@ +2181::M::25::0 +2073::F::18::4 diff --git a/models/demo/movie_recommand/data/process_ml_1m.py b/models/demo/movie_recommand/data/process_ml_1m.py new file mode 100644 index 0000000000000000000000000000000000000000..7125f6254882f79d67129aac6513900cb3b765b1 --- /dev/null +++ b/models/demo/movie_recommand/data/process_ml_1m.py @@ -0,0 +1,146 @@ +#coding=utf8 +import sys +reload(sys) +sys.setdefaultencoding('utf-8') +import random +import json +user_fea = ["userid", "gender", "age", "occupation"] +movie_fea = ["movieid", "title", "genres"] +rating_fea = ["userid", "movieid", "rating", "time"] +dict_size = 60000000 +hash_dict = dict() + +data_path = "ml-1m" +test_user_path = "online_user" + + +def process(path): + user_dict = parse_data(data_path + "/users.dat", user_fea) + movie_dict = parse_movie_data(data_path + "/movies.dat", movie_fea) + + for line in open(path): + line = line.strip() + arr = line.split("::") + userid = arr[0] + movieid = arr[1] + out_str = "time:%s\t%s\t%s\tlabel:%s" % (arr[3], user_dict[userid], + movie_dict[movieid], arr[2]) + log_id = hash(out_str) % 1000000000 + print "%s\t%s" % (log_id, out_str) + + +def parse_data(file_name, feas): + dict = {} + for line in open(file_name): + line = line.strip() + arr = line.split("::") + out_str = "" + for i in range(0, len(feas)): + out_str += "%s:%s\t" % (feas[i], arr[i]) + + dict[arr[0]] = out_str.strip() + return dict + + +def parse_movie_data(file_name, feas): + dict = {} + for line in open(file_name): + line = line.strip() + arr = line.split("::") + title_str = "" + genres_str = "" + + for term in arr[1].split(" "): + term = term.strip() + if term != "": + title_str += "%s " % (term) + for term in arr[2].split("|"): + term = term.strip() + if term != "": + genres_str += "%s " % (term) + out_str = "movieid:%s\ttitle:%s\tgenres:%s" % ( + arr[0], title_str.strip(), genres_str.strip()) + dict[arr[0]] = out_str.strip() + return dict + + +def to_hash(in_str): + feas = in_str.split(":")[0] + arr = in_str.split(":")[1] + out_str = "%s:%s" % (feas, (arr + arr[::-1] + arr[::-2] + arr[::-3])) + hash_id = hash(out_str) % dict_size + if hash_id in hash_dict and hash_dict[hash_id] != out_str: + print(hash_id, out_str, hash(out_str)) + print("conflict") + exit(-1) + + return "%s:%s" % (feas, hash_id) + + +def to_hash_list(in_str): + arr = in_str.split(":") + tmp_arr = arr[1].split(" ") + out_str = "" + for item in tmp_arr: + item = item.strip() + if item != "": + key = "%s:%s" % (arr[0], item) + out_str += "%s " % (to_hash(key)) + return out_str.strip() + + +def get_hash(path): + #0-34831 1-time:974673057 2-userid:2021 3-gender:M 4-age:25 5-occupation:0 6-movieid:1345 7-title:Carrie (1976) 8-genres:Horror 9-label:2 + for line in open(path): + arr = line.strip().split("\t") + out_str = "logid:%s %s %s %s %s %s %s %s %s %s" % \ + (arr[0], arr[1], to_hash(arr[2]), to_hash(arr[3]), to_hash(arr[4]), to_hash(arr[5]), \ + to_hash(arr[6]), to_hash_list(arr[7]), to_hash_list(arr[8]), arr[9]) + print out_str + + +def generate_online_user(): + movie_dict = parse_movie_data(data_path + "/movies.dat", movie_fea) + + with open(test_user_path + "/movies.dat", 'w') as f: + for line in open(test_user_path + "/users.dat"): + line = line.strip() + arr = line.split("::") + userid = arr[0] + for item in movie_dict: + f.write(userid + "::" + item + "::1") + f.write("\n") + + +def generate_online_data(path): + user_dict = parse_data(data_path + "/users.dat", user_fea) + movie_dict = parse_movie_data(data_path + "/movies.dat", movie_fea) + + for line in open(path): + line = line.strip() + arr = line.split("::") + userid = arr[0] + movieid = arr[1] + label = arr[2] + out_str = "time:%s\t%s\t%s\tlabel:%s" % ("1", user_dict[userid], + movie_dict[movieid], label) + log_id = hash(out_str) % 1000000000 + res = "%s\t%s" % (log_id, out_str) + arr = res.strip().split("\t") + out_str = "logid:%s %s %s %s %s %s %s %s %s %s" % \ + (arr[0], arr[1], to_hash(arr[2]), to_hash(arr[3]), to_hash(arr[4]), to_hash(arr[5]), \ + to_hash(arr[6]), to_hash_list(arr[7]), to_hash_list(arr[8]), arr[9]) + print(out_str) + + +if __name__ == "__main__": + random.seed(1111111) + if sys.argv[1] == "process_raw": + process(sys.argv[2]) + elif sys.argv[1] == "hash": + get_hash(sys.argv[2]) + elif sys.argv[1] == "data_recall": + generate_online_user() + generate_online_data(test_user_path + "/movies.dat") + elif sys.argv[1] == "data_rank": + generate_online_data(test_user_path + "/movies.dat") diff --git a/models/demo/movie_recommand/data/split.py b/models/demo/movie_recommand/data/split.py new file mode 100644 index 0000000000000000000000000000000000000000..9c0a7fd0960b5654e4ea867dd5888dfc2c471076 --- /dev/null +++ b/models/demo/movie_recommand/data/split.py @@ -0,0 +1,51 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random + +train = dict() +test = dict() +data_path = "ml-1m" + +for line in open(data_path + "/ratings.dat"): + fea = line.rstrip().split("::") + if fea[0] not in train: + train[fea[0]] = [line] + elif fea[0] not in test: + test[fea[0]] = dict() + test[fea[0]]['time'] = int(fea[3]) + test[fea[0]]['content'] = line + else: + time = int(fea[3]) + if time <= test[fea[0]]['time']: + train[fea[0]].append(line) + else: + train[fea[0]].append(test[fea[0]]['content']) + test[fea[0]]['time'] = time + test[fea[0]]['content'] = line + +train_data = [] +for key in train: + for line in train[key]: + train_data.append(line) + +random.shuffle(train_data) + +with open(data_path + "/train.dat", 'w') as f: + for line in train_data: + f.write(line) + +with open(data_path + "/test.dat", 'w') as f: + for key in test: + f.write(test[key]['content']) diff --git a/models/demo/movie_recommand/data/test/data.txt b/models/demo/movie_recommand/data/test/data.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/models/demo/movie_recommand/data/train/data.txt b/models/demo/movie_recommand/data/train/data.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/models/demo/movie_recommand/data_prepare.sh b/models/demo/movie_recommand/data_prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..f99b5b273b4ed496030cfe46bf228ae32159ee26 --- /dev/null +++ b/models/demo/movie_recommand/data_prepare.sh @@ -0,0 +1,18 @@ +cd data + +wget http://files.grouplens.org/datasets/movielens/ml-1m.zip +unzip ml-1m.zip + +python split.py + +mkdir train/ +mkdir test/ + +python process_ml_1m.py process_raw ./ml-1m/train.dat | sort -t $'\t' -k 9 -n > log.data.train +python process_ml_1m.py process_raw ./ml-1m/test.dat | sort -t $'\t' -k 9 -n > log.data.test +python process_ml_1m.py hash log.data.train > ./train/data.txt +python process_ml_1m.py hash log.data.test > ./test/data.txt + +rm log.data.train +rm log.data.test +cd ../ diff --git a/models/demo/movie_recommand/offline_test.sh b/models/demo/movie_recommand/offline_test.sh new file mode 100644 index 0000000000000000000000000000000000000000..88bf29cebf25b185bcdbb13cf64db5b0984b7704 --- /dev/null +++ b/models/demo/movie_recommand/offline_test.sh @@ -0,0 +1,12 @@ +## modify config.yaml to infer mode at first + +cd recall +python -m paddlerec.run -m ./config.yaml +cd ../rank +python -m paddlerec.run -m ./config.yaml +cd .. + +echo "recall offline test result:" +python parse.py recall_offline recall/infer_result +echo "rank offline test result:" +python parse.py rank_offline rank/infer_result diff --git a/models/demo/movie_recommand/online_rank.sh b/models/demo/movie_recommand/online_rank.sh new file mode 100644 index 0000000000000000000000000000000000000000..f2f5f167493e1c35f824f0bd87a922d25f832191 --- /dev/null +++ b/models/demo/movie_recommand/online_rank.sh @@ -0,0 +1,8 @@ +cd data +python process_ml_1m.py data_rank > online_user/test/data.txt + +## modify recall/config.yaml to online_infer mode +cd ../rank +python -m paddlerec.run -m ./config.yaml +cd ../ +python parse.py rank_online rank/infer_result diff --git a/models/demo/movie_recommand/online_recall.sh b/models/demo/movie_recommand/online_recall.sh new file mode 100644 index 0000000000000000000000000000000000000000..23fa7912c2f173310da7f73694833aeaa59646df --- /dev/null +++ b/models/demo/movie_recommand/online_recall.sh @@ -0,0 +1,9 @@ +cd data +mkdir online_user/test +python process_ml_1m.py data_recall > online_user/test/data.txt + +## modify recall/config.yaml to online_infer mode +cd ../recall +python -m paddlerec.run -m ./config.yaml +cd ../ +python parse.py recall_online recall/infer_result diff --git a/models/demo/movie_recommand/parse.py b/models/demo/movie_recommand/parse.py new file mode 100644 index 0000000000000000000000000000000000000000..55cf92ec3595c1e180e1e353de49603031bcaf59 --- /dev/null +++ b/models/demo/movie_recommand/parse.py @@ -0,0 +1,176 @@ +#coding=utf8 +import sys +reload(sys) +sys.setdefaultencoding('utf-8') +import random +import json +import numpy as np +import operator + +user_fea = ["userid", "gender", "age", "occupation"] +movie_fea = ["movieid", "title", "genres"] +rating_fea = ["userid", "movieid", "rating", "time"] +dict_size = 60000000 +hash_dict = dict() + +data_path = "data/ml-1m" +test_user_path = "data/online_user" +topk = 100 + + +def read_raw_data(): + user_dict = parse_data(data_path + "/users.dat", user_fea) + movie_dict = parse_data(data_path + "/movies.dat", movie_fea) + ratings_dict = dict() + for line in open(data_path + "/ratings.dat"): + arr = line.strip().split("::") + if arr[0] not in ratings_dict: + ratings_dict[arr[0]] = [] + tmp = dict() + tmp["movieid"] = arr[1] + tmp["score"] = arr[2] + tmp["time"] = arr[3] + ratings_dict[arr[0]].append(tmp) + return user_dict, movie_dict, ratings_dict + + +def parse_data(file_name, feas): + res = {} + for line in open(file_name): + line = line.strip() + arr = line.split("::") + res[arr[0]] = dict() + _ = to_hash(feas[0], arr[0]) + for i in range(0, len(feas)): + res[arr[0]][feas[i]] = arr[i] + return res + + +def to_hash(feas, arr): + out_str = "%s:%s" % (feas, (arr + arr[::-1] + arr[::-2] + arr[::-3])) + hash_id = hash(out_str) % dict_size + if hash_id in hash_dict and hash_dict[hash_id] != out_str: + print(hash_id, out_str, hash(out_str), hash_dict[hash_id]) + print("conflict") + exit(-1) + hash_dict[hash_id] = out_str + return hash_id + + +def load_ground_truth(user_dict, movie_dict, ratings_dict): + for line in open(test_user_path + "/users.dat"): + uid = line.strip().split("::")[0] + display_user(user_dict[uid]) + ratings_dict[uid] = sorted( + ratings_dict[uid], + key=lambda i: (i["score"], i["time"]), + reverse=True) + ratings_dict[uid] = ratings_dict[uid][:topk] + for i in range(len(ratings_dict[uid])): + item = ratings_dict[uid][i] + mid = item["movieid"] + for key in movie_fea: + item[key] = movie_dict[mid][key] + display_movies(ratings_dict[uid]) + + +def load_infer_results(path, feas, movie_dict): + with open(path) as f: + content = json.load(f) + + total = 0 + correct = 0 + mae = 0.0 + + res = dict() + for item in content: + userid = reduce(operator.add, item[feas["userid"]]) + movieid = reduce(operator.add, item[feas["movieid"]]) + ratings = reduce(operator.add, item[feas["ratings"]]) + predict = map(int, ratings) + label = reduce(operator.add, item[feas["label"]]) + + mae += sum(np.square(np.array(ratings) - np.array(label))) + total += len(label) + correct += sum(np.array(predict) == np.array(label)) + + for i in range(len(userid)): + hash_uid = userid[i] + hash_mid = movieid[i] + if hash_uid not in hash_dict or hash_mid not in hash_dict: + continue + tmp = hash_dict[hash_uid].split(':')[1] + uid = tmp[:len(tmp) / 3] + tmp = hash_dict[hash_mid].split(':')[1] + mid = tmp[:len(tmp) / 3] + if uid not in res: + res[uid] = [] + item = {"score": ratings[i]} + for info in movie_dict[mid]: + item[info] = movie_dict[mid][info] + res[uid].append(item) + + for key in res: + tmp = sorted(res[key], key=lambda i: i["score"], reverse=True) + existed_movie = [] + res[key] = [] + for i in range(len(tmp)): + if len(res[key]) >= topk: + break + if tmp[i]["movieid"] not in existed_movie: + existed_movie.append(tmp[i]["movieid"]) + res[key].append(tmp[i]) + + print("total: " + str(total) + "; correct: " + str(correct)) + print("accuracy: " + str(float(correct) / total)) + print("mae: " + str(mae / total)) + return res + + +def display_user(item): + out_str = "" + for key in user_fea: + out_str += "%s:%s " % (key, item[key]) + print(out_str) + + +def display_movies(input): + for item in input: + print_str = "" + for key in movie_fea: + print_str += "%s:%s " % (key, item[key]) + print_str += "%s:%s" % ("score", item["score"]) + print(print_str) + + +def parse_infer(mode, path, user_dict, movie_dict): + stage, online = mode.split('_') + feas = { + "userid": "userid", + "movieid": "movieid", + "ratings": "scale_0.tmp_0", + "label": "label" + } + + infer_results = load_infer_results(path, feas, movie_dict) + if online.startswith("offline"): + return + + for uid in infer_results: + display_user(user_dict[uid]) + display_movies(infer_results[uid]) + + with open(test_user_path + "/movies.dat", 'w') as fout: + for uid in infer_results: + for item in infer_results[uid]: + str_ = uid + "::" + str(item["movieid"]) + "::" + str( + int(item["score"])) + "\n" + fout.write(str_) + + +if __name__ == "__main__": + user_dict, movie_dict, ratings_dict = read_raw_data() + if sys.argv[1] == "ground_truth": + load_ground_truth(user_dict, movie_dict, ratings_dict) + else: + parse_infer(sys.argv[1], sys.argv[2], user_dict, movie_dict) diff --git a/models/demo/movie_recommand/rank/__init__.py b/models/demo/movie_recommand/rank/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/models/demo/movie_recommand/rank/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/models/demo/movie_recommand/rank/config.yaml b/models/demo/movie_recommand/rank/config.yaml new file mode 100755 index 0000000000000000000000000000000000000000..4ad9e458a3a9bbf2a6c658c2d046807581b6c1f5 --- /dev/null +++ b/models/demo/movie_recommand/rank/config.yaml @@ -0,0 +1,93 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +workspace: "paddlerec.models.demo.movie_recommand" + +# list of dataset +dataset: +- name: dataset_train # name of dataset to distinguish different datasets + batch_size: 128 + type: QueueDataset + data_path: "{workspace}/data/train" + sparse_slots: "logid time userid gender age occupation movieid title genres label" + dense_slots: "" +- name: dataset_infer # name + batch_size: 128 + type: DataLoader + data_path: "{workspace}/data/test" + sparse_slots: "logid time userid gender age occupation movieid title genres label" + dense_slots: "" +- name: dataset_online_infer # name + batch_size: 10 + type: DataLoader + data_path: "{workspace}/data/online_user/test" + sparse_slots: "logid time userid gender age occupation movieid title genres label" + dense_slots: "" + +# hyper parameters of user-defined network +hyper_parameters: + # optimizer config + optimizer: + class: Adam + learning_rate: 0.001 + strategy: async + # user-defined pairs + sparse_feature_number: 60000000 + sparse_feature_dim: 9 + dense_input_dim: 13 + fc_sizes: [512, 256, 128, 32] + +# train +mode: runner_train + +## online or offline infer +#mode: runner_infer +runner: +- name: runner_train + class: single_train + save_checkpoint_interval: 1 # save model interval of epochs + save_inference_interval: 1 # save inference + save_checkpoint_path: "increment" # save checkpoint path + save_inference_path: "inference" # save inference path + epochs: 10 + device: cpu + +- name: runner_infer + epochs: 1 + class: single_infer + print_interval: 10000 + init_model_path: "increment/9" # load model path + +#train +phase: +- name: phase1 + model: "{workspace}/model.py" # user-defined model + dataset_name: dataset_train # select dataset by name + thread_num: 12 + +##offline infer +#phase: +#- name: phase1 +# model: "{workspace}/model.py" # user-defined model +# dataset_name: dataset_infer # select dataset by name +# save_path: "./infer_result" +# thread_num: 1 + +##offline infer +#phase: +#- name: phase1 +# model: "{workspace}/model.py" # user-defined model +# dataset_name: dataset_online_infer # select dataset by name +# save_path: "./infer_result" +# thread_num: 1 diff --git a/models/demo/movie_recommand/rank/model.py b/models/demo/movie_recommand/rank/model.py new file mode 100755 index 0000000000000000000000000000000000000000..2393e3549194ab0cf7009ef71121ba2fe559c56a --- /dev/null +++ b/models/demo/movie_recommand/rank/model.py @@ -0,0 +1,120 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import paddle.fluid as fluid + +from paddlerec.core.utils import envs +from paddlerec.core.model import Model as ModelBase + + +class Model(ModelBase): + def __init__(self, config): + ModelBase.__init__(self, config) + + def _init_hyper_parameters(self): + self.is_distributed = True if envs.get_trainer( + ) == "CtrTrainer" else False + self.sparse_feature_number = envs.get_global_env( + "hyper_parameters.sparse_feature_number") + self.sparse_feature_dim = envs.get_global_env( + "hyper_parameters.sparse_feature_dim") + self.learning_rate = envs.get_global_env( + "hyper_parameters.optimizer.learning_rate") + self.hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes") + + def net(self, input, is_infer=False): + self.user_sparse_inputs = self._sparse_data_var[2:6] + self.mov_sparse_inputs = self._sparse_data_var[6:9] + + self.label_input = self._sparse_data_var[-1] + + def fc(input): + fcs = [input] + for size in self.hidden_layers: + output = fluid.layers.fc( + input=fcs[-1], + size=size, + act='relu', + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Normal( + scale=1.0 / math.sqrt(fcs[-1].shape[1])))) + fcs.append(output) + return fcs[-1] + + def embedding_layer(input): + emb = fluid.layers.embedding( + input=input, + is_sparse=True, + is_distributed=self.is_distributed, + size=[self.sparse_feature_number, self.sparse_feature_dim], + param_attr=fluid.ParamAttr( + name="emb", initializer=fluid.initializer.Uniform()), ) + emb_sum = fluid.layers.sequence_pool(input=emb, pool_type='sum') + return emb_sum + + user_sparse_embed_seq = list( + map(embedding_layer, self.user_sparse_inputs)) + mov_sparse_embed_seq = list( + map(embedding_layer, self.mov_sparse_inputs)) + concated_user = fluid.layers.concat(user_sparse_embed_seq, axis=1) + concated_mov = fluid.layers.concat(mov_sparse_embed_seq, axis=1) + + usr_combined_features = fc(concated_user) + mov_combined_features = fc(concated_mov) + + fc_input = fluid.layers.concat( + [usr_combined_features, mov_combined_features], axis=1) + sim = fluid.layers.fc( + input=fc_input, + size=1, + act='sigmoid', + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal( + scale=1.0 / math.sqrt(fc_input.shape[1])))) + + predict = fluid.layers.scale(sim, scale=5) + self.predict = predict + #auc, batch_auc, _ = fluid.layers.auc(input=self.predict, + # label=self.label_input, + # num_thresholds=10000, + # slide_steps=20) + + if is_infer: + self._infer_results["user_feature"] = usr_combined_features + self._infer_results["movie_feature"] = mov_combined_features + self._infer_results["uid"] = self._sparse_data_var[2] + self._infer_results["movieid"] = self._sparse_data_var[6] + self._infer_results["label"] = self._sparse_data_var[-1] + self._infer_results["predict"] = self.predict + return + + #self._metrics["AUC"] = auc + #self._metrics["BATCH_AUC"] = batch_auc + #cost = fluid.layers.cross_entropy( + # input=self.predict, label=self.label_input) + cost = fluid.layers.square_error_cost( + self.predict, + fluid.layers.cast( + x=self.label_input, dtype='float32')) + avg_cost = fluid.layers.reduce_mean(cost) + self._cost = avg_cost + self._metrics["LOSS"] = avg_cost + + def optimizer(self): + optimizer = fluid.optimizer.Adam(self.learning_rate, lazy_mode=True) + return optimizer + + def infer_net(self): + pass diff --git a/models/demo/movie_recommand/recall/__init__.py b/models/demo/movie_recommand/recall/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/models/demo/movie_recommand/recall/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/models/demo/movie_recommand/recall/config.yaml b/models/demo/movie_recommand/recall/config.yaml new file mode 100755 index 0000000000000000000000000000000000000000..91575611c41fea3e1a90d1a5eb77ad4c99ea7fbe --- /dev/null +++ b/models/demo/movie_recommand/recall/config.yaml @@ -0,0 +1,93 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +workspace: "paddlerec.models.demo.movie_recommand" + +# list of dataset +dataset: +- name: dataset_train # name of dataset to distinguish different datasets + batch_size: 128 + type: QueueDataset + data_path: "{workspace}/data/train" + sparse_slots: "logid time userid gender age occupation movieid title genres label" + dense_slots: "" +- name: dataset_infer # name + batch_size: 128 + type: DataLoader + data_path: "{workspace}/data/test" + sparse_slots: "logid time userid gender age occupation movieid title genres label" + dense_slots: "" +- name: dataset_online_infer # name + batch_size: 128 + type: DataLoader + data_path: "{workspace}/data/online_user/test" + sparse_slots: "logid time userid gender age occupation movieid title genres label" + dense_slots: "" + +# hyper parameters of user-defined network +hyper_parameters: + # optimizer config + optimizer: + class: Adam + learning_rate: 0.001 + strategy: async + # user-defined pairs + sparse_feature_number: 60000000 + sparse_feature_dim: 9 + dense_input_dim: 13 + fc_sizes: [512, 256, 128, 32] + +# train +mode: runner_train + +## online or offline infer +#mode: runner_infer +runner: +- name: runner_train + class: single_train + save_checkpoint_interval: 1 # save model interval of epochs + save_inference_interval: 1 # save inference + save_checkpoint_path: "increment" # save checkpoint path + save_inference_path: "inference" # save inference path + epochs: 10 + device: cpu + +- name: runner_infer + epochs: 1 + class: single_infer + print_interval: 10000 + init_model_path: "increment/9" # load model path + +#train +phase: +- name: phase1 + model: "{workspace}/model.py" # user-defined model + dataset_name: dataset_train # select dataset by name + thread_num: 12 + +##offline infer +#phase: +#- name: phase1 +# model: "{workspace}/model.py" # user-defined model +# dataset_name: dataset_infer # select dataset by name +# save_path: "./infer_result" +# thread_num: 1 + +##offline infer +#phase: +#- name: phase1 +# model: "{workspace}/model.py" # user-defined model +# dataset_name: dataset_online_infer # select dataset by name +# save_path: "./infer_result" +# thread_num: 1 diff --git a/models/demo/movie_recommand/recall/model.py b/models/demo/movie_recommand/recall/model.py new file mode 100755 index 0000000000000000000000000000000000000000..13773ef51bd0d9383328f1e0fe268550d3b45b53 --- /dev/null +++ b/models/demo/movie_recommand/recall/model.py @@ -0,0 +1,100 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import paddle.fluid as fluid + +from paddlerec.core.utils import envs +from paddlerec.core.model import Model as ModelBase + + +class Model(ModelBase): + def __init__(self, config): + ModelBase.__init__(self, config) + + def _init_hyper_parameters(self): + self.is_distributed = True if envs.get_trainer( + ) == "CtrTrainer" else False + self.sparse_feature_number = envs.get_global_env( + "hyper_parameters.sparse_feature_number") + self.sparse_feature_dim = envs.get_global_env( + "hyper_parameters.sparse_feature_dim") + self.learning_rate = envs.get_global_env( + "hyper_parameters.optimizer.learning_rate") + self.hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes") + + def net(self, input, is_infer=False): + self.user_sparse_inputs = self._sparse_data_var[2:6] + self.mov_sparse_inputs = self._sparse_data_var[6:9] + + self.label_input = self._sparse_data_var[-1] + + def fc(input): + fcs = [input] + for size in self.hidden_layers: + output = fluid.layers.fc( + input=fcs[-1], + size=size, + act='relu', + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Normal( + scale=1.0 / math.sqrt(fcs[-1].shape[1])))) + fcs.append(output) + return fcs[-1] + + def embedding_layer(input): + emb = fluid.layers.embedding( + input=input, + is_sparse=True, + is_distributed=self.is_distributed, + size=[self.sparse_feature_number, self.sparse_feature_dim], + param_attr=fluid.ParamAttr( + name="emb", initializer=fluid.initializer.Uniform()), ) + emb_sum = fluid.layers.sequence_pool(input=emb, pool_type='sum') + return emb_sum + + user_sparse_embed_seq = list( + map(embedding_layer, self.user_sparse_inputs)) + mov_sparse_embed_seq = list( + map(embedding_layer, self.mov_sparse_inputs)) + concated_user = fluid.layers.concat(user_sparse_embed_seq, axis=1) + concated_mov = fluid.layers.concat(mov_sparse_embed_seq, axis=1) + + usr_combined_features = fc(concated_user) + mov_combined_features = fc(concated_mov) + + sim = fluid.layers.cos_sim( + X=usr_combined_features, Y=mov_combined_features) + predict = fluid.layers.scale(sim, scale=5) + self.predict = predict + + if is_infer: + self._infer_results["uid"] = self._sparse_data_var[2] + self._infer_results["movieid"] = self._sparse_data_var[6] + self._infer_results["label"] = self._sparse_data_var[-1] + self._infer_results["predict"] = self.predict + return + + cost = fluid.layers.square_error_cost( + self.predict, + fluid.layers.cast( + x=self.label_input, dtype='float32')) + avg_cost = fluid.layers.reduce_mean(cost) + self._cost = avg_cost + self._metrics["LOSS"] = avg_cost + + def optimizer(self): + optimizer = fluid.optimizer.Adam(self.learning_rate, lazy_mode=True) + return optimizer diff --git a/models/demo/movie_recommand/train.sh b/models/demo/movie_recommand/train.sh new file mode 100644 index 0000000000000000000000000000000000000000..47756c1414030bf3cd5da0532198eedf19eff3e0 --- /dev/null +++ b/models/demo/movie_recommand/train.sh @@ -0,0 +1,5 @@ +cd recall +python -m paddlerec.run -m ./config.yaml &> log & +cd ../rank +python -m paddlerec.run -m ./config.yaml &> log & +cd ..