# process_ml_1m.py

#coding=utf8
import sys
# Python 2 only: ``reload`` is a builtin there, and reloading ``sys`` makes
# ``sys.setdefaultencoding`` reachable again (site.py removes it at startup).
# The default str<->unicode codec is then switched to UTF-8 for the whole
# process. Neither call exists on Python 3.
reload(sys)
sys.setdefaultencoding('utf-8')
import random
# NOTE(review): json is not referenced in the visible part of this file.
import json
# Column names, in file order, for the "::"-separated MovieLens tables.
user_fea = [
    "userid",
    "gender",
    "age",
    "occupation",
]
movie_fea = [
    "movieid",
    "title",
    "genres",
]
rating_fea = [
    "userid",
    "movieid",
    "rating",
    "time",
]

# Modulus applied to feature hashes in to_hash().
dict_size = 60000000
# Bucket -> hashed key table consulted by to_hash() when checking collisions.
hash_dict = {}

# Directory holding the raw users.dat / movies.dat tables.
data_path = "ml-1m"
# Directory holding users.dat for online users and the generated movies.dat.
test_user_path = "online_user"


def process(path):
    """Join a raw ratings file with user and movie features and print samples.

    Each non-empty line of ``path`` is split on ``::`` into
    ``userid, movieid, rating, time``. For every rating one line is printed
    to stdout, with TAB-separated fields:
    ``<log_id>``, ``time:<time>``, the user's feature string, the movie's
    feature string and ``label:<rating>``. ``log_id`` is the builtin hash of
    the rest of the line modulo 1000000000.

    Args:
        path: ratings file to read.

    Raises:
        IndexError: a non-empty line has fewer than four ``::`` fields.
        KeyError: a rating refers to a user or movie id that is missing from
            the tables loaded from ``data_path``.
    """
    user_dict = parse_data(data_path + "/users.dat", user_fea)
    movie_dict = parse_movie_data(data_path + "/movies.dat", movie_fea)

    # ``with`` guarantees the ratings file is closed even if a line raises.
    with open(path) as ratings:
        for line in ratings:
            line = line.strip()
            if not line:
                # A blank line (e.g. trailing newline) carries no rating.
                continue
            arr = line.split("::")
            userid = arr[0]
            movieid = arr[1]
            out_str = "time:%s\t%s\t%s\tlabel:%s" % (
                arr[3], user_dict[userid], movie_dict[movieid], arr[2])
            # NOTE: builtin hash() of a str is only stable across runs on
            # Python 2 (or Python 3 with a fixed PYTHONHASHSEED).
            log_id = hash(out_str) % 1000000000
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("%s\t%s" % (log_id, out_str))


def parse_data(file_name, feas):
    """Load a ``::``-separated table into ``{first column: feature string}``.

    For every non-empty line the first ``len(feas)`` columns are rendered as
    ``name:value`` pairs joined by TABs, e.g.
    ``userid:1<TAB>gender:F<TAB>age:1<TAB>occupation:10``.

    Args:
        file_name: path of the table to read.
        feas: column names, in file order.

    Returns:
        dict mapping the first column of each row to its feature string.
        Later rows overwrite earlier rows that share the same first column.

    Raises:
        IndexError: a non-empty row has fewer columns than ``feas``.
    """
    records = {}
    # ``with`` closes the table even when a malformed row raises.
    with open(file_name) as table:
        for line in table:
            line = line.strip()
            if not line:
                # Skip blank lines instead of failing on a missing column.
                continue
            fields = line.split("::")
            # Index explicitly (rather than zip) so short rows still raise.
            pairs = ["%s:%s" % (name, fields[i])
                     for i, name in enumerate(feas)]
            records[fields[0]] = "\t".join(pairs).strip()
    return records


def _join_terms(text, sep):
    """Split ``text`` on ``sep``, trim each piece, drop empties, join by space."""
    trimmed = (piece.strip() for piece in text.split(sep))
    return " ".join(piece for piece in trimmed if piece)


def parse_movie_data(file_name, feas):
    """Load the movie table into ``{movieid: feature string}``.

    Each non-empty line is split on ``::`` into ``movieid, title, genres``.
    Runs of spaces in the title are collapsed to single spaces and the
    ``|``-separated genres are re-joined with spaces. The feature string is
    ``movieid:<id><TAB>title:<title><TAB>genres:<genres>``.

    Args:
        file_name: path of the movie table to read.
        feas: accepted for symmetry with ``parse_data``; not used here.

    Returns:
        dict mapping each movie id to its feature string.

    Raises:
        IndexError: a non-empty row has fewer than three ``::`` fields.
    """
    movies = {}
    # ``with`` closes the table even when a malformed row raises.
    with open(file_name) as table:
        for line in table:
            line = line.strip()
            if not line:
                # Skip blank lines instead of failing on a missing column.
                continue
            arr = line.split("::")
            title_str = _join_terms(arr[1], " ")
            genres_str = _join_terms(arr[2], "|")
            out_str = "movieid:%s\ttitle:%s\tgenres:%s" % (
                arr[0], title_str, genres_str)
            movies[arr[0]] = out_str.strip()
    return movies


def to_hash(in_str):
    """Turn a ``name:value`` feature into ``name:<bucket>``.

    The value is extended with three reversed slices of itself (steps -1,
    -2 and -3), prefixed with the name again, and the builtin hash of that
    key modulo ``dict_size`` becomes the bucket.

    Args:
        in_str: feature text; only the first two ``:``-separated pieces
            are used.

    Returns:
        ``"<name>:<bucket>"``.

    Raises:
        IndexError: ``in_str`` contains no ``:``.
    """
    pieces = in_str.split(":")
    name = pieces[0]
    value = pieces[1]
    expanded = "".join([value, value[::-1], value[::-2], value[::-3]])
    key = "%s:%s" % (name, expanded)
    bucket = hash(key) % dict_size

    # NOTE(review): no visible code stores anything into hash_dict, so this
    # collision guard looks like it can never trigger -- confirm intent.
    collides = bucket in hash_dict and hash_dict[bucket] != key
    if collides:
        print(bucket, key, hash(key))
        print("conflict")
        exit(-1)

    return "%s:%s" % (name, bucket)


def to_hash_list(in_str):
    """Hash every space-separated token of a ``name:tok1 tok2 ...`` feature.

    The text is split on the FIRST ``:`` only, so a value that itself
    contains a colon (for example a title such as ``Star Wars: Episode IV``)
    keeps all of its tokens instead of being cut at the second colon.

    Args:
        in_str: feature text of the form ``name:token token ...``.

    Returns:
        The ``to_hash`` results for ``name:<token>`` of every non-empty
        token, joined by single spaces (empty string when there are none).

    Raises:
        IndexError: ``in_str`` contains no ``:``.
    """
    parts = in_str.split(":", 1)
    name = parts[0]
    value = parts[1]
    tokens = (token.strip() for token in value.split(" "))
    hashed = [to_hash("%s:%s" % (name, token)) for token in tokens if token]
    return " ".join(hashed).strip()


def get_hash(path):
    """Hash the categorical fields of samples and print them.

    Each non-empty line of ``path`` is split on TABs into ten fields:
    0 log id, 1 time, 2 userid, 3 gender, 4 age, 5 occupation, 6 movieid,
    7 title, 8 genres, 9 label. Example input fields:
    ``34831``, ``time:974673057``, ``userid:2021``, ``gender:M``, ``age:25``,
    ``occupation:0``, ``movieid:1345``, ``title:Carrie (1976)``,
    ``genres:Horror``, ``label:2``.

    Fields 2-6 go through ``to_hash``, fields 7-8 through ``to_hash_list``;
    the time and label fields are copied unchanged. One space-separated line
    starting with ``logid:<log id>`` is printed per sample.

    Args:
        path: file of TAB-separated samples to read.

    Raises:
        IndexError: a non-empty line has fewer than ten TAB-separated fields.
    """
    # ``with`` guarantees the sample file is closed even if a line raises.
    with open(path) as samples:
        for line in samples:
            line = line.strip()
            if not line:
                # Blank lines carry no sample.
                continue
            arr = line.split("\t")
            out_str = "logid:%s %s %s %s %s %s %s %s %s %s" % (
                arr[0], arr[1],
                to_hash(arr[2]), to_hash(arr[3]), to_hash(arr[4]),
                to_hash(arr[5]), to_hash(arr[6]),
                to_hash_list(arr[7]), to_hash_list(arr[8]),
                arr[9])
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(out_str)


def generate_online_user():
    """Write one candidate row per (online user, movie) pair.

    Reads user ids (first ``::`` field of each non-empty line) from
    ``<test_user_path>/users.dat`` and, for every movie id loaded from
    ``<data_path>/movies.dat``, writes ``userid::movieid::1`` to
    ``<test_user_path>/movies.dat``. The output file is overwritten.
    """
    movie_dict = parse_movie_data(data_path + "/movies.dat", movie_fea)

    with open(test_user_path + "/movies.dat", 'w') as f:
        # Open the input in a ``with`` too so it is closed deterministically.
        with open(test_user_path + "/users.dat") as users:
            for line in users:
                line = line.strip()
                if not line:
                    # A blank line would otherwise emit rows with an empty
                    # user id.
                    continue
                userid = line.split("::")[0]
                f.writelines(
                    userid + "::" + movieid + "::1\n"
                    for movieid in movie_dict)


def generate_online_data(path):
    """Build and print hashed samples for ``userid::movieid::label`` rows.

    For each non-empty line of ``path`` the user and movie feature strings
    are looked up, combined with a constant ``time:1`` field and the label
    into the same TAB-separated layout that ``process`` prints, and then
    hashed field by field exactly like ``get_hash`` does. One
    space-separated line starting with ``logid:`` is printed per row.

    Args:
        path: file of ``userid::movieid::label`` rows.

    Raises:
        IndexError: a non-empty line has fewer than three ``::`` fields.
        KeyError: a row refers to a user or movie id that is missing from
            the tables loaded from ``data_path``.
    """
    user_dict = parse_data(data_path + "/users.dat", user_fea)
    movie_dict = parse_movie_data(data_path + "/movies.dat", movie_fea)

    # ``with`` guarantees the input file is closed even if a line raises.
    with open(path) as rows:
        for line in rows:
            line = line.strip()
            if not line:
                # Blank lines carry no row.
                continue
            arr = line.split("::")
            userid = arr[0]
            movieid = arr[1]
            label = arr[2]
            out_str = "time:%s\t%s\t%s\tlabel:%s" % (
                "1", user_dict[userid], movie_dict[movieid], label)
            log_id = hash(out_str) % 1000000000
            res = "%s\t%s" % (log_id, out_str)
            # Re-split the assembled sample so the field indices match the
            # ten-field layout hashed below.
            fields = res.strip().split("\t")
            hashed = "logid:%s %s %s %s %s %s %s %s %s %s" % (
                fields[0], fields[1],
                to_hash(fields[2]), to_hash(fields[3]), to_hash(fields[4]),
                to_hash(fields[5]), to_hash(fields[6]),
                to_hash_list(fields[7]), to_hash_list(fields[8]),
                fields[9])
            print(hashed)


if __name__ == "__main__":
    # Command-line entry point:
    #   process_raw <path>  -> process(path)
    #   hash <path>         -> get_hash(path)
    #   data_recall         -> generate_online_user() + generate_online_data()
    #   data_rank           -> generate_online_data()
    random.seed(1111111)

    usage = ("usage: %s process_raw|hash <path>\n"
             "       %s data_recall|data_rank\n" % (sys.argv[0], sys.argv[0]))
    command = sys.argv[1] if len(sys.argv) > 1 else None

    if command in ("process_raw", "hash"):
        if len(sys.argv) < 3:
            # Report the missing path instead of failing with IndexError.
            sys.stderr.write(usage)
            sys.exit(1)
        if command == "process_raw":
            process(sys.argv[2])
        else:
            get_hash(sys.argv[2])
    elif command == "data_recall":
        generate_online_user()
        generate_online_data(test_user_path + "/movies.dat")
    elif command == "data_rank":
        generate_online_data(test_user_path + "/movies.dat")
    else:
        # Missing or unknown command: say so rather than doing nothing.
        sys.stderr.write(usage)
        sys.exit(1)