From 07388edddda839f16e7859b62f2938d9bd0c3e75 Mon Sep 17 00:00:00 2001
From: malin10
Date: Sat, 6 Jun 2020 10:56:18 +0800
Subject: [PATCH] bug fix

---
 models/demo/movie_recommand/offline_test.sh |   2 +-
 models/demo/movie_recommand/parse.py        | 176 ++++++++++++++++++++
 models/demo/movie_recommand/train.sh        |   7 +-
 3 files changed, 179 insertions(+), 6 deletions(-)
 create mode 100644 models/demo/movie_recommand/parse.py

diff --git a/models/demo/movie_recommand/offline_test.sh b/models/demo/movie_recommand/offline_test.sh
index f4b83c51..88bf29ce 100644
--- a/models/demo/movie_recommand/offline_test.sh
+++ b/models/demo/movie_recommand/offline_test.sh
@@ -9,4 +9,4 @@ cd ..
 echo "recall offline test result:"
 python parse.py recall_offline recall/infer_result
 echo "rank offline test result:"
-python parse.py recall_offline rank/infer_result
+python parse.py rank_offline rank/infer_result
diff --git a/models/demo/movie_recommand/parse.py b/models/demo/movie_recommand/parse.py
new file mode 100644
index 00000000..55cf92ec
--- /dev/null
+++ b/models/demo/movie_recommand/parse.py
@@ -0,0 +1,176 @@
+#coding=utf8
+import sys
+reload(sys)
+sys.setdefaultencoding('utf-8')
+import random
+import json
+import numpy as np
+import operator
+
+user_fea = ["userid", "gender", "age", "occupation"]
+movie_fea = ["movieid", "title", "genres"]
+rating_fea = ["userid", "movieid", "rating", "time"]
+dict_size = 60000000
+hash_dict = dict()
+
+data_path = "data/ml-1m"
+test_user_path = "data/online_user"
+topk = 100
+
+
+def read_raw_data():
+    user_dict = parse_data(data_path + "/users.dat", user_fea)
+    movie_dict = parse_data(data_path + "/movies.dat", movie_fea)
+    ratings_dict = dict()
+    for line in open(data_path + "/ratings.dat"):
+        arr = line.strip().split("::")
+        if arr[0] not in ratings_dict:
+            ratings_dict[arr[0]] = []
+        tmp = dict()
+        tmp["movieid"] = arr[1]
+        tmp["score"] = arr[2]
+        tmp["time"] = arr[3]
+        ratings_dict[arr[0]].append(tmp)
+    return user_dict, movie_dict, ratings_dict
+
+
+def parse_data(file_name, feas):
+    res = {}
+    for line in open(file_name):
+        line = line.strip()
+        arr = line.split("::")
+        res[arr[0]] = dict()
+        _ = to_hash(feas[0], arr[0])
+        for i in range(0, len(feas)):
+            res[arr[0]][feas[i]] = arr[i]
+    return res
+
+
+def to_hash(feas, arr):
+    out_str = "%s:%s" % (feas, (arr + arr[::-1] + arr[::-2] + arr[::-3]))
+    hash_id = hash(out_str) % dict_size
+    if hash_id in hash_dict and hash_dict[hash_id] != out_str:
+        print(hash_id, out_str, hash(out_str), hash_dict[hash_id])
+        print("conflict")
+        exit(-1)
+    hash_dict[hash_id] = out_str
+    return hash_id
+
+
+def load_ground_truth(user_dict, movie_dict, ratings_dict):
+    for line in open(test_user_path + "/users.dat"):
+        uid = line.strip().split("::")[0]
+        display_user(user_dict[uid])
+        ratings_dict[uid] = sorted(
+            ratings_dict[uid],
+            key=lambda i: (i["score"], i["time"]),
+            reverse=True)
+        ratings_dict[uid] = ratings_dict[uid][:topk]
+        for i in range(len(ratings_dict[uid])):
+            item = ratings_dict[uid][i]
+            mid = item["movieid"]
+            for key in movie_fea:
+                item[key] = movie_dict[mid][key]
+        display_movies(ratings_dict[uid])
+
+
+def load_infer_results(path, feas, movie_dict):
+    with open(path) as f:
+        content = json.load(f)
+
+    total = 0
+    correct = 0
+    mae = 0.0
+
+    res = dict()
+    for item in content:
+        userid = reduce(operator.add, item[feas["userid"]])
+        movieid = reduce(operator.add, item[feas["movieid"]])
+        ratings = reduce(operator.add, item[feas["ratings"]])
+        predict = map(int, ratings)
+        label = reduce(operator.add, item[feas["label"]])
+
+        mae += sum(np.square(np.array(ratings) - np.array(label)))
+        total += len(label)
+        correct += sum(np.array(predict) == np.array(label))
+
+        for i in range(len(userid)):
+            hash_uid = userid[i]
+            hash_mid = movieid[i]
+            if hash_uid not in hash_dict or hash_mid not in hash_dict:
+                continue
+            tmp = hash_dict[hash_uid].split(':')[1]
+            uid = tmp[:len(tmp) / 3]
+            tmp = hash_dict[hash_mid].split(':')[1]
+            mid = tmp[:len(tmp) / 3]
+            if uid not in res:
+                res[uid] = []
+            item = {"score": ratings[i]}
+            for info in movie_dict[mid]:
+                item[info] = movie_dict[mid][info]
+            res[uid].append(item)
+
+    for key in res:
+        tmp = sorted(res[key], key=lambda i: i["score"], reverse=True)
+        existed_movie = []
+        res[key] = []
+        for i in range(len(tmp)):
+            if len(res[key]) >= topk:
+                break
+            if tmp[i]["movieid"] not in existed_movie:
+                existed_movie.append(tmp[i]["movieid"])
+                res[key].append(tmp[i])
+
+    print("total: " + str(total) + "; correct: " + str(correct))
+    print("accuracy: " + str(float(correct) / total))
+    print("mae: " + str(mae / total))
+    return res
+
+
+def display_user(item):
+    out_str = ""
+    for key in user_fea:
+        out_str += "%s:%s " % (key, item[key])
+    print(out_str)
+
+
+def display_movies(input):
+    for item in input:
+        print_str = ""
+        for key in movie_fea:
+            print_str += "%s:%s " % (key, item[key])
+        print_str += "%s:%s" % ("score", item["score"])
+        print(print_str)
+
+
+def parse_infer(mode, path, user_dict, movie_dict):
+    stage, online = mode.split('_')
+    feas = {
+        "userid": "userid",
+        "movieid": "movieid",
+        "ratings": "scale_0.tmp_0",
+        "label": "label"
+    }
+
+    infer_results = load_infer_results(path, feas, movie_dict)
+    if online.startswith("offline"):
+        return
+
+    for uid in infer_results:
+        display_user(user_dict[uid])
+        display_movies(infer_results[uid])
+
+    with open(test_user_path + "/movies.dat", 'w') as fout:
+        for uid in infer_results:
+            for item in infer_results[uid]:
+                str_ = uid + "::" + str(item["movieid"]) + "::" + str(
+                    int(item["score"])) + "\n"
+                fout.write(str_)
+
+
+if __name__ == "__main__":
+    user_dict, movie_dict, ratings_dict = read_raw_data()
+    if sys.argv[1] == "ground_truth":
+        load_ground_truth(user_dict, movie_dict, ratings_dict)
+    else:
+        parse_infer(sys.argv[1], sys.argv[2], user_dict, movie_dict)
diff --git a/models/demo/movie_recommand/train.sh b/models/demo/movie_recommand/train.sh
index 4eb53f45..47756c14 100644
--- a/models/demo/movie_recommand/train.sh
+++ b/models/demo/movie_recommand/train.sh
@@ -1,8 +1,5 @@
 cd recall
-python -m paddlerec.run -m ./config.yaml
+python -m paddlerec.run -m ./config.yaml &> log &
 cd ../rank
-python -m paddlerec.run -m ./config.yaml &> train_log &
+python -m paddlerec.run -m ./config.yaml &> log &
 cd ..
-
-echo "recall offline test: "
-python infer_analys
--
GitLab