# coding=utf8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Pre-process "::"-separated user / movie / rating files.

Three sub-commands are available from the command line:

* ``process_raw <ratings_file> <output_file>`` joins every rating with the
  user and movie attributes found under ``data_path``.
* ``hash <processed_file>`` prints every processed record with its feature
  values replaced by hashed ids.
* ``split <file> <output_dir>`` splits a file into ``part_1`` ... ``part_N``.
"""

import os
import sys

if sys.version_info[0] < 3:
    # Python 2 only: ``reload`` is a builtin there and ``setdefaultencoding``
    # does not exist on Python 3, so both calls must stay behind this guard.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')

import random
import json

user_fea = ["userid", "gender", "age", "occupation"]
movie_fea = ["movieid", "title", "genres"]
rating_fea = ["userid", "movieid", "rating", "time"]
dict_size = 1000000
hash_dict = dict()

data_path = "ml-1m"
test_user_path = "online_user"

USAGE = ("usage:\n"
         "  process_raw <ratings_file> <output_file>\n"
         "  hash <processed_file>\n"
         "  split <file> <output_dir>\n")


def _iter_fields(file_name, sep="::"):
    """Yield the `sep`-separated fields of every non-blank line of a file."""
    with open(file_name) as fin:
        for line in fin:
            line = line.strip()
            if line:
                yield line.split(sep)


def process(path, output_path):
    """Join every rating in `path` with its user and movie attributes.

    Each input line is ``userid::movieid::rating::time``. The user and movie
    attributes are loaded from ``users.dat`` and ``movies.dat`` under
    ``data_path``. One line per rating is written to `output_path`::

        <log_id>\\ttime:<time>\\t<user features>\\t<movie features>\\tlabel:<rating>

    Blank input lines are skipped.

    Raises:
        KeyError: if a rating refers to an unknown user or movie id.
        IndexError: if a rating line has fewer than four fields.
    """
    user_dict = parse_data(data_path + "/users.dat", user_fea)
    movie_dict = parse_movie_data(data_path + "/movies.dat", movie_fea)

    # Everything is built in memory first so that the output file is only
    # created once the whole input has been processed successfully.
    res = []
    for arr in _iter_fields(path):
        userid = arr[0]
        movieid = arr[1]
        out_str = "time:%s\t%s\t%s\tlabel:%s" % (
            arr[3], user_dict[userid], movie_dict[movieid], arr[2])
        # NOTE: on Python 3 the builtin hash() of a str is salted per process
        # unless PYTHONHASHSEED is fixed, so log ids differ between runs.
        log_id = hash(out_str) % 1000000000
        res.append("%s\t%s" % (log_id, out_str))

    with open(output_path, 'w') as fout:
        for line in res:
            fout.write(line)
            fout.write("\n")


def parse_data(file_name, feas):
    """Parse a "::"-separated file into ``{first field: feature string}``.

    The i-th field of every line is labelled with ``feas[i]``; the labelled
    fields are joined with tabs, e.g. ``userid:1\\tgender:F``. Fields beyond
    ``len(feas)`` are ignored and blank lines are skipped.

    Raises:
        IndexError: if a line has fewer fields than `feas`.
    """
    result = {}
    for arr in _iter_fields(file_name):
        out_str = "\t".join(
            "%s:%s" % (fea, arr[i]) for i, fea in enumerate(feas))
        result[arr[0]] = out_str.strip()
    return result


def parse_movie_data(file_name, feas):
    """Parse a ``movieid::title::genres`` file into ``{movieid: features}``.

    Runs of spaces in the title are collapsed to a single space and the
    "|"-separated genres are re-joined with single spaces. `feas` is accepted
    for symmetry with `parse_data` but is not used. Blank lines are skipped.

    Raises:
        IndexError: if a line has fewer than three fields.
    """
    result = {}
    for arr in _iter_fields(file_name):
        title_terms = (term.strip() for term in arr[1].split(" "))
        genre_terms = (term.strip() for term in arr[2].split("|"))
        title_str = " ".join(term for term in title_terms if term != "")
        genres_str = " ".join(term for term in genre_terms if term != "")
        out_str = "movieid:%s\ttitle:%s\tgenres:%s" % (arr[0], title_str,
                                                     genres_str)
        result[arr[0]] = out_str.strip()
    return result


def to_hash(in_str):
    """Replace the value of a ``name:value`` string by a hashed id.

    The value is everything after the FIRST colon. It is concatenated with
    three reversed/strided copies of itself before hashing, and the hash is
    reduced modulo ``dict_size``.

    Returns:
        ``"name:<hash id>"``.

    Raises:
        IndexError: if `in_str` contains no colon.
    """
    parts = in_str.split(":", 1)
    feas = parts[0]
    arr = parts[1]
    out_str = "%s:%s" % (feas, (arr + arr[::-1] + arr[::-2] + arr[::-3]))
    # NOTE: on Python 3 the builtin hash() of a str is salted per process
    # unless PYTHONHASHSEED is fixed; pin it if ids must be reproducible.
    hash_id = hash(out_str) % dict_size
    # Collision detection is currently disabled:
    # if hash_id in hash_dict and hash_dict[hash_id] != out_str:
    #     print(hash_id, out_str, hash(out_str))
    #     print("conflict")
    #     exit(-1)
    return "%s:%s" % (feas, hash_id)


def to_hash_list(in_str):
    """Hash every space-separated token of a ``name:tok1 tok2 ...`` string.

    The token list is everything after the FIRST colon, so values that
    contain colons themselves are not truncated.

    Returns:
        The ``to_hash`` result of every non-empty token, joined by spaces.

    Raises:
        IndexError: if `in_str` contains no colon.
    """
    parts = in_str.split(":", 1)
    feas = parts[0]
    hashed = []
    for item in parts[1].split(" "):
        item = item.strip()
        if item != "":
            hashed.append(to_hash("%s:%s" % (feas, item)))
    return " ".join(hashed).strip()


def get_hash(path):
    """Print every record of a processed file with hashed feature values.

    Expected tab-separated input fields, for example:
    0-34831 1-time:974673057 2-userid:2021 3-gender:M 4-age:25
    5-occupation:0 6-movieid:1345 7-title:Carrie (1976) 8-genres:Horror
    9-label:2

    Fields 1 (time) and 9 (label) are copied unchanged, fields 2-6 go through
    `to_hash` and fields 7-8 through `to_hash_list`. Blank lines are skipped.

    Raises:
        IndexError: if a line has fewer than ten fields.
    """
    for arr in _iter_fields(path, "\t"):
        out_str = "logid:%s %s %s %s %s %s %s %s %s %s" % (
            arr[0], arr[1], to_hash(arr[2]), to_hash(arr[3]),
            to_hash(arr[4]), to_hash(arr[5]), to_hash(arr[6]),
            to_hash_list(arr[7]), to_hash_list(arr[8]), arr[9])
        print(out_str)


def split(path, output_dir, num=24):
    """Split the lines of `path` into `num` files under `output_dir`.

    The files are named ``part_1`` ... ``part_<num>``. Every part receives
    ``len(lines) // num`` consecutive lines; the last part additionally
    receives the remainder so that no input line is lost.
    """
    with open(path) as f:
        contents = f.readlines()

    # Floor division keeps the slice bounds integral on Python 3 as well.
    lines_per_file = len(contents) // num
    print("contents: %s" % len(contents))
    print("lines_per_file: %s" % lines_per_file)

    for i in range(1, num + 1):
        start = (i - 1) * lines_per_file
        # The last part absorbs the lines left over by the floor division.
        end = len(contents) if i == num else i * lines_per_file
        with open(os.path.join(output_dir, "part_" + str(i)), 'w') as fout:
            fout.writelines(contents[start:end])


def main(argv):
    """Dispatch ``argv[1]`` to the matching sub-command.

    Returns:
        0 on success, 1 when the command is unknown or arguments are missing.
    """
    random.seed(1111111)
    # command name -> (function, number of required positional arguments)
    commands = {
        "process_raw": (process, 2),
        "hash": (get_hash, 1),
        "split": (split, 2),
    }
    if len(argv) < 2 or argv[1] not in commands:
        sys.stderr.write(USAGE)
        return 1
    func, n_args = commands[argv[1]]
    args = argv[2:2 + n_args]
    if len(args) < n_args:
        sys.stderr.write(USAGE)
        return 1
    func(*args)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))