From 07388edddda839f16e7859b62f2938d9bd0c3e75 Mon Sep 17 00:00:00 2001
From: malin10
Date: Sat, 6 Jun 2020 10:56:18 +0800
Subject: [PATCH] bug fix

---
 models/demo/movie_recommand/offline_test.sh |   2 +-
 models/demo/movie_recommand/parse.py        | 176 ++++++++++++++++++++
 models/demo/movie_recommand/train.sh        |   7 +-
 3 files changed, 179 insertions(+), 6 deletions(-)
 create mode 100644 models/demo/movie_recommand/parse.py

diff --git a/models/demo/movie_recommand/offline_test.sh b/models/demo/movie_recommand/offline_test.sh
index f4b83c51..88bf29ce 100644
--- a/models/demo/movie_recommand/offline_test.sh
+++ b/models/demo/movie_recommand/offline_test.sh
@@ -9,4 +9,4 @@ cd ..
 echo "recall offline test result:"
 python parse.py recall_offline recall/infer_result
 echo "rank offline test result:"
-python parse.py recall_offline rank/infer_result
+python parse.py rank_offline rank/infer_result
diff --git a/models/demo/movie_recommand/parse.py b/models/demo/movie_recommand/parse.py
new file mode 100644
index 00000000..55cf92ec
--- /dev/null
+++ b/models/demo/movie_recommand/parse.py
@@ -0,0 +1,176 @@
+#coding=utf8
+import sys
+reload(sys)
+sys.setdefaultencoding('utf-8')
+import random
+import json
+import numpy as np
+import operator
+
+user_fea = ["userid", "gender", "age", "occupation"]
+movie_fea = ["movieid", "title", "genres"]
+rating_fea = ["userid", "movieid", "rating", "time"]
+dict_size = 60000000
+hash_dict = dict()
+
+data_path = "data/ml-1m"
+test_user_path = "data/online_user"
+topk = 100
+
+
+def read_raw_data():
+    user_dict = parse_data(data_path + "/users.dat", user_fea)
+    movie_dict = parse_data(data_path + "/movies.dat", movie_fea)
+    ratings_dict = dict()
+    for line in open(data_path + "/ratings.dat"):
+        arr = line.strip().split("::")
+        if arr[0] not in ratings_dict:
+            ratings_dict[arr[0]] = []
+        tmp = dict()
+        tmp["movieid"] = arr[1]
+        tmp["score"] = arr[2]
+        tmp["time"] = arr[3]
+        ratings_dict[arr[0]].append(tmp)
+    return user_dict, movie_dict, ratings_dict
+
+
+def parse_data(file_name, feas):
+    res = {}
+    for line in open(file_name):
+        line = line.strip()
+        arr = line.split("::")
+        res[arr[0]] = dict()
+        _ = to_hash(feas[0], arr[0])
+        for i in range(0, len(feas)):
+            res[arr[0]][feas[i]] = arr[i]
+    return res
+
+
+def to_hash(feas, arr):
+    out_str = "%s:%s" % (feas, (arr + arr[::-1] + arr[::-2] + arr[::-3]))
+    hash_id = hash(out_str) % dict_size
+    if hash_id in hash_dict and hash_dict[hash_id] != out_str:
+        print(hash_id, out_str, hash(out_str), hash_dict[hash_id])
+        print("conflict")
+        exit(-1)
+    hash_dict[hash_id] = out_str
+    return hash_id
+
+
+def load_ground_truth(user_dict, movie_dict, ratings_dict):
+    for line in open(test_user_path + "/users.dat"):
+        uid = line.strip().split("::")[0]
+        display_user(user_dict[uid])
+        ratings_dict[uid] = sorted(
+            ratings_dict[uid],
+            key=lambda i: (i["score"], i["time"]),
+            reverse=True)
+        ratings_dict[uid] = ratings_dict[uid][:topk]
+        for i in range(len(ratings_dict[uid])):
+            item = ratings_dict[uid][i]
+            mid = item["movieid"]
+            for key in movie_fea:
+                item[key] = movie_dict[mid][key]
+        display_movies(ratings_dict[uid])
+
+
+def load_infer_results(path, feas, movie_dict):
+    with open(path) as f:
+        content = json.load(f)
+
+    total = 0
+    correct = 0
+    mae = 0.0
+
+    res = dict()
+    for item in content:
+        userid = reduce(operator.add, item[feas["userid"]])
+        movieid = reduce(operator.add, item[feas["movieid"]])
+        ratings = reduce(operator.add, item[feas["ratings"]])
+        predict = map(int, ratings)
+        label = reduce(operator.add, item[feas["label"]])
+
+        mae += sum(np.square(np.array(ratings) - np.array(label)))
+        total += len(label)
+        correct += sum(np.array(predict) == np.array(label))
+
+        for i in range(len(userid)):
+            hash_uid = userid[i]
+            hash_mid = movieid[i]
+            if hash_uid not in hash_dict or hash_mid not in hash_dict:
+                continue
+            tmp = hash_dict[hash_uid].split(':')[1]
+            uid = tmp[:len(tmp) / 3]
+            tmp = hash_dict[hash_mid].split(':')[1]
+            mid = tmp[:len(tmp) / 3]
+            if uid not in res:
+                res[uid] = []
+            item = {"score": ratings[i]}
+            for info in movie_dict[mid]:
+                item[info] = movie_dict[mid][info]
+            res[uid].append(item)
+
+    for key in res:
+        tmp = sorted(res[key], key=lambda i: i["score"], reverse=True)
+        existed_movie = []
+        res[key] = []
+        for i in range(len(tmp)):
+            if len(res[key]) >= topk:
+                break
+            if tmp[i]["movieid"] not in existed_movie:
+                existed_movie.append(tmp[i]["movieid"])
+                res[key].append(tmp[i])
+
+    print("total: " + str(total) + "; correct: " + str(correct))
+    print("accuracy: " + str(float(correct) / total))
+    print("mae: " + str(mae / total))
+    return res
+
+
+def display_user(item):
+    out_str = ""
+    for key in user_fea:
+        out_str += "%s:%s " % (key, item[key])
+    print(out_str)
+
+
+def display_movies(input):
+    for item in input:
+        print_str = ""
+        for key in movie_fea:
+            print_str += "%s:%s " % (key, item[key])
+        print_str += "%s:%s" % ("score", item["score"])
+        print(print_str)
+
+
+def parse_infer(mode, path, user_dict, movie_dict):
+    stage, online = mode.split('_')
+    feas = {
+        "userid": "userid",
+        "movieid": "movieid",
+        "ratings": "scale_0.tmp_0",
+        "label": "label"
+    }
+
+    infer_results = load_infer_results(path, feas, movie_dict)
+    if online.startswith("offline"):
+        return
+
+    for uid in infer_results:
+        display_user(user_dict[uid])
+        display_movies(infer_results[uid])
+
+    with open(test_user_path + "/movies.dat", 'w') as fout:
+        for uid in infer_results:
+            for item in infer_results[uid]:
+                str_ = uid + "::" + str(item["movieid"]) + "::" + str(
+                    int(item["score"])) + "\n"
+                fout.write(str_)
+
+
+if __name__ == "__main__":
+    user_dict, movie_dict, ratings_dict = read_raw_data()
+    if sys.argv[1] == "ground_truth":
+        load_ground_truth(user_dict, movie_dict, ratings_dict)
+    else:
+        parse_infer(sys.argv[1], sys.argv[2], user_dict, movie_dict)
diff --git a/models/demo/movie_recommand/train.sh b/models/demo/movie_recommand/train.sh
index 4eb53f45..47756c14 100644
--- a/models/demo/movie_recommand/train.sh
+++ b/models/demo/movie_recommand/train.sh
@@ -1,8 +1,5 @@
 cd recall
-python -m paddlerec.run -m ./config.yaml
+python -m paddlerec.run -m ./config.yaml &> log &
 cd ../rank
-python -m paddlerec.run -m ./config.yaml &> train_log &
+python -m paddlerec.run -m ./config.yaml &> log &
 cd ..
-
-echo "recall offline test: "
-python infer_analys
--
GitLab