# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ This file provides the Dataset for LINE model. """ import os import io import sys import numpy as np from pgl import graph from pgl.utils.logger import log class FlickrDataset(object): """Flickr dataset implementation Args: name: The name of the dataset. symmetry_edges: Whether to create symmetry edges. self_loop: Whether to contain self loop edges. train_percentage: The percentage of nodes to be trained in multi class task. Attributes: graph: The :code:`Graph` data object. num_groups: Number of classes. train_index: The index for nodes in training set. test_index: The index for nodes in validation set. """ def __init__(self, data_path, symmetry_edges=False, self_loop=False, train_percentage=0.5): self.path = data_path # self.name = name self.num_groups = 5 self.symmetry_edges = symmetry_edges self.self_loop = self_loop self.train_percentage = train_percentage self._load_data() def _load_data(self): edge_path = os.path.join(self.path, 'edges.txt') node_path = os.path.join(self.path, 'nodes.txt') nodes_label_path = os.path.join(self.path, 'nodes_label.txt') all_edges = [] edges_weight = [] with io.open(node_path) as inf: num_nodes = len(inf.readlines()) node_feature = np.zeros((num_nodes, self.num_groups)) with io.open(nodes_label_path) as inf: for line in inf: # group_id means the label of the node node_id, group_id = line.strip('\n').split(',') node_id = int(node_id) - 1 labels = group_id.split(' ') for i in labels: node_feature[node_id][int(i) - 1] = 1 node_degree_list = [1 for _ in range(num_nodes)] with io.open(edge_path) as inf: for line in inf: items = line.strip().split('\t') if len(items) == 2: u, v = int(items[0]), int(items[1]) weight = 1 # binary weight, default set to 1 else: u, v, weight = int(items[0]), int(items[1]), float(items[ 2]), u, v = u - 1, v - 1 all_edges.append((u, v)) edges_weight.append(weight) if self.symmetry_edges: all_edges.append((v, u)) edges_weight.append(weight) # sum the weights of the same node as the outdegree node_degree_list[u] += weight if self.self_loop: for i in range(num_nodes): all_edges.append((i, i)) edges_weight.append(1.) all_edges = list(set(all_edges)) self.graph = graph.Graph( num_nodes=num_nodes, edges=all_edges, node_feat={"group_id": node_feature}) perm = np.arange(0, num_nodes) np.random.shuffle(perm) train_num = int(num_nodes * self.train_percentage) self.train_index = perm[:train_num] self.test_index = perm[train_num:] edge_distribution = np.array(edges_weight, dtype=np.float32) self.edge_distribution = edge_distribution / np.sum(edge_distribution) self.edge_sampling = AliasSampling(prob=edge_distribution) node_dist = np.array(node_degree_list, dtype=np.float32) node_negative_distribution = np.power(node_dist, 0.75) self.node_negative_distribution = node_negative_distribution / np.sum( node_negative_distribution) self.node_sampling = AliasSampling(prob=node_negative_distribution) self.node_index = {} self.node_index_reversed = {} for index, e in enumerate(self.graph.edges): self.node_index[e[0]] = index self.node_index_reversed[index] = e[0] def fetch_batch(self, batch_size=16, K=10, edge_sampling='alias', node_sampling='alias'): """Fetch batch data from dataset. """ if edge_sampling == 'numpy': edge_batch_index = np.random.choice( self.graph.num_edges, size=batch_size, p=self.edge_distribution) elif edge_sampling == 'alias': edge_batch_index = self.edge_sampling.sampling(batch_size) elif edge_sampling == 'uniform': edge_batch_index = np.random.randint( 0, self.graph.num_edges, size=batch_size) u_i = [] u_j = [] label = [] for edge_index in edge_batch_index: edge = self.graph.edges[edge_index] u_i.append(edge[0]) u_j.append(edge[1]) label.append(1) for i in range(K): while True: if node_sampling == 'numpy': negative_node = np.random.choice( self.graph.num_nodes, p=self.node_negative_distribution) elif node_sampling == 'alias': negative_node = self.node_sampling.sampling() elif node_sampling == 'uniform': negative_node = np.random.randint(0, self.graph.num_nodes) # make sure the sampled node has no edge with the source node if not self.graph.has_edges_between( np.array( [self.node_index_reversed[negative_node]]), np.array([self.node_index_reversed[edge[0]]])): break u_i.append(edge[0]) u_j.append(negative_node) label.append(-1) u_i = np.array([u_i], dtype=np.int64).T u_j = np.array([u_j], dtype=np.int64).T label = np.array(label, dtype=np.float32) return u_i, u_j, label class AliasSampling: """Implemention of Alias-Method This is an implementation of Alias-Method for sampling efficiently from a discrete probability distribution. Reference: https://en.wikipedia.org/wiki/Alias_method Args: prob: The discrete probability distribution. """ def __init__(self, prob): self.n = len(prob) self.U = np.array(prob) * self.n self.K = [i for i in range(len(prob))] overfull, underfull = [], [] for i, U_i in enumerate(self.U): if U_i > 1: overfull.append(i) elif U_i < 1: underfull.append(i) while len(overfull) and len(underfull): i, j = overfull.pop(), underfull.pop() self.K[j] = i self.U[i] = self.U[i] - (1 - self.U[j]) if self.U[i] > 1: overfull.append(i) elif self.U[i] < 1: underfull.append(i) def sampling(self, n=1): """Sampling. """ x = np.random.rand(n) i = np.floor(self.n * x) y = self.n * x - i i = i.astype(np.int64) res = [i[k] if y[k] < self.U[i[k]] else self.K[i[k]] for k in range(n)] if n == 1: return res[0] else: return res