提交 07388edd 编写于 作者: M malin10

bug fix

上级 07f9c21f
......@@ -9,4 +9,4 @@ cd ..
echo "recall offline test result:"
python parse.py recall_offline recall/infer_result
echo "rank offline test result:"
python parse.py recall_offline rank/infer_result
python parse.py rank_offline rank/infer_result
#coding=utf8
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import random
import json
import numpy as np
import operator
user_fea = ["userid", "gender", "age", "occupation"]
movie_fea = ["movieid", "title", "genres"]
rating_fea = ["userid", "movieid", "rating", "time"]
dict_size = 60000000
hash_dict = dict()
data_path = "data/ml-1m"
test_user_path = "data/online_user"
topk = 100
def read_raw_data():
user_dict = parse_data(data_path + "/users.dat", user_fea)
movie_dict = parse_data(data_path + "/movies.dat", movie_fea)
ratings_dict = dict()
for line in open(data_path + "/ratings.dat"):
arr = line.strip().split("::")
if arr[0] not in ratings_dict:
ratings_dict[arr[0]] = []
tmp = dict()
tmp["movieid"] = arr[1]
tmp["score"] = arr[2]
tmp["time"] = arr[3]
ratings_dict[arr[0]].append(tmp)
return user_dict, movie_dict, ratings_dict
def parse_data(file_name, feas):
res = {}
for line in open(file_name):
line = line.strip()
arr = line.split("::")
res[arr[0]] = dict()
_ = to_hash(feas[0], arr[0])
for i in range(0, len(feas)):
res[arr[0]][feas[i]] = arr[i]
return res
def to_hash(feas, arr):
out_str = "%s:%s" % (feas, (arr + arr[::-1] + arr[::-2] + arr[::-3]))
hash_id = hash(out_str) % dict_size
if hash_id in hash_dict and hash_dict[hash_id] != out_str:
print(hash_id, out_str, hash(out_str), hash_dict[hash_id])
print("conflict")
exit(-1)
hash_dict[hash_id] = out_str
return hash_id
def load_ground_truth(user_dict, movie_dict, ratings_dict):
for line in open(test_user_path + "/users.dat"):
uid = line.strip().split("::")[0]
display_user(user_dict[uid])
ratings_dict[uid] = sorted(
ratings_dict[uid],
key=lambda i: (i["score"], i["time"]),
reverse=True)
ratings_dict[uid] = ratings_dict[uid][:topk]
for i in range(len(ratings_dict[uid])):
item = ratings_dict[uid][i]
mid = item["movieid"]
for key in movie_fea:
item[key] = movie_dict[mid][key]
display_movies(ratings_dict[uid])
def load_infer_results(path, feas, movie_dict):
with open(path) as f:
content = json.load(f)
total = 0
correct = 0
mae = 0.0
res = dict()
for item in content:
userid = reduce(operator.add, item[feas["userid"]])
movieid = reduce(operator.add, item[feas["movieid"]])
ratings = reduce(operator.add, item[feas["ratings"]])
predict = map(int, ratings)
label = reduce(operator.add, item[feas["label"]])
mae += sum(np.square(np.array(ratings) - np.array(label)))
total += len(label)
correct += sum(np.array(predict) == np.array(label))
for i in range(len(userid)):
hash_uid = userid[i]
hash_mid = movieid[i]
if hash_uid not in hash_dict or hash_mid not in hash_dict:
continue
tmp = hash_dict[hash_uid].split(':')[1]
uid = tmp[:len(tmp) / 3]
tmp = hash_dict[hash_mid].split(':')[1]
mid = tmp[:len(tmp) / 3]
if uid not in res:
res[uid] = []
item = {"score": ratings[i]}
for info in movie_dict[mid]:
item[info] = movie_dict[mid][info]
res[uid].append(item)
for key in res:
tmp = sorted(res[key], key=lambda i: i["score"], reverse=True)
existed_movie = []
res[key] = []
for i in range(len(tmp)):
if len(res[key]) >= topk:
break
if tmp[i]["movieid"] not in existed_movie:
existed_movie.append(tmp[i]["movieid"])
res[key].append(tmp[i])
print("total: " + str(total) + "; correct: " + str(correct))
print("accuracy: " + str(float(correct) / total))
print("mae: " + str(mae / total))
return res
def display_user(item):
out_str = ""
for key in user_fea:
out_str += "%s:%s " % (key, item[key])
print(out_str)
def display_movies(input):
for item in input:
print_str = ""
for key in movie_fea:
print_str += "%s:%s " % (key, item[key])
print_str += "%s:%s" % ("score", item["score"])
print(print_str)
def parse_infer(mode, path, user_dict, movie_dict):
stage, online = mode.split('_')
feas = {
"userid": "userid",
"movieid": "movieid",
"ratings": "scale_0.tmp_0",
"label": "label"
}
infer_results = load_infer_results(path, feas, movie_dict)
if online.startswith("offline"):
return
for uid in infer_results:
display_user(user_dict[uid])
display_movies(infer_results[uid])
with open(test_user_path + "/movies.dat", 'w') as fout:
for uid in infer_results:
for item in infer_results[uid]:
str_ = uid + "::" + str(item["movieid"]) + "::" + str(
int(item["score"])) + "\n"
fout.write(str_)
if __name__ == "__main__":
user_dict, movie_dict, ratings_dict = read_raw_data()
if sys.argv[1] == "ground_truth":
load_ground_truth(user_dict, movie_dict, ratings_dict)
else:
parse_infer(sys.argv[1], sys.argv[2], user_dict, movie_dict)
cd recall
python -m paddlerec.run -m ./config.yaml
python -m paddlerec.run -m ./config.yaml &> log &
cd ../rank
python -m paddlerec.run -m ./config.yaml &> train_log &
python -m paddlerec.run -m ./config.yaml &> log &
cd ..
echo "recall offline test: "
python infer_analys
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册