# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file preprocesses the Flickr dataset for the LINE model.
"""

import argparse
import operator
import os


def process_data(groupsMemberships_file,
                 flickr_links_file,
                 users_label_file,
                 edges_file,
                 users_file,
                 num_groups=5):
    """Preprocess the flickr network dataset.

    Keeps only the users that belong to the ``num_groups`` largest groups,
    relabels those groups as 1..num_groups (largest first), renumbers the
    remaining users as 1..N (in sorted order of their original string IDs)
    and writes the label, edge and node files.

    Args:
        groupsMemberships_file: flickr-groupmemberships.txt file, each line
            is a whitespace-separated pair (user, group), which indicates a
            user belongs to a group.
        flickr_links_file: flickr-links.txt file, each line is a
            tab-separated pair (user, user), which indicates the two users
            have a relationship.
        users_label_file: output path; each line is
            ``<node id>,<space-separated labels>``, since each user may
            belong to multiple groups.
        edges_file: output path; each line is a tab-separated pair
            (node id, node id). Edges touching a filtered-out user are
            dropped.
        users_file: output path; each line is an int, the ID of a node.
        num_groups: how many of the largest groups to keep. Defaults to 5,
            the number used by the original script. If the dataset has
            fewer groups, all of them are kept.

    Raises:
        ValueError: if ``num_groups`` is negative, if a non-blank input line
            does not contain exactly two fields, or if a group ID is not an
            integer.
    """
    if num_groups < 0:
        raise ValueError('num_groups must be non-negative, got %r' %
                         (num_groups, ))

    # group ID -> users of that group, in file order
    group2users = {}
    with open(groupsMemberships_file, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # tolerate blank / trailing empty lines
                continue
            user, group = fields
            group2users.setdefault(int(group), []).append(user)

    # counting how many users belong to every group
    group2usersNum = {
        group: len(users)
        for group, users in group2users.items()
    }
    # The sort is stable, so groups of equal size keep their first-seen order.
    groups_sorted_by_usersNum = sorted(
        group2usersNum.items(), key=operator.itemgetter(1), reverse=True)

    # Only the largest groups are kept; they are remapped to labels
    # 1..num_groups. Slicing (rather than indexing) avoids an IndexError when
    # fewer groups exist.
    users_label = {}
    top_groups = groups_sorted_by_usersNum[:num_groups]
    for label, (group, _) in enumerate(top_groups, start=1):
        for user in group2users[group]:
            # one user may have multi-labels
            users_label.setdefault(user, []).append(label)

    # remapping the users IDs to make the IDs from 1 to N; the original IDs
    # are strings, so the order is lexicographic
    sorted_users = sorted(users_label)
    userID2nodeID = {
        user: node_id
        for node_id, user in enumerate(sorted_users, start=1)
    }

    with open(users_label_file, 'w') as writer:
        for user in sorted_users:
            labels = ' '.join(str(label) for label in users_label[user])
            writer.write(str(userID2nodeID[user]) + ',' + labels + '\n')

    # produce edges file
    with open(flickr_links_file, 'r') as reader, \
            open(edges_file, 'w') as writer:
        for line in reader:
            line = line.strip()
            if not line:
                # tolerate blank / trailing empty lines
                continue
            src, dst = line.split('\t')
            # filter unused user IDs
            if src in userID2nodeID and dst in userID2nodeID:
                # remapping the users IDs
                writer.write(
                    str(userID2nodeID[src]) + '\t' + str(userID2nodeID[dst]) +
                    '\n')

    # produce nodes file
    with open(users_file, 'w') as writer:
        for node_id in range(1, 1 + len(userID2nodeID)):
            writer.write(str(node_id) + '\n')


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='LINE')
    parser.add_argument(
        '--groupmemberships',
        type=str,
        default='./data/flickr/flickr-groupmemberships.txt',
        help='groupmemberships of flickr dataset')

    parser.add_argument(
        '--flickr_links',
        type=str,
        default='./data/flickr/flickr-links.txt',
        help='the flickr-links.txt file for training')

    parser.add_argument(
        '--nodes_label',
        type=str,
        default='./data/flickr/nodes_label.txt',
        help='nodes (users) label file for training')

    parser.add_argument(
        '--edges',
        type=str,
        default='./data/flickr/edges.txt',
        help='the result edges (links) file for training')

    parser.add_argument(
        '--nodes',
        type=str,
        default='./data/flickr/nodes.txt',
        help='the nodes (users) file for training')

    args = parser.parse_args()
    process_data(args.groupmemberships, args.flickr_links, args.nodes_label,
                 args.edges, args.nodes)