add pinsage sampling

b46b2b1a · liweibin · 0bd10e14 · b46b2b1a
隐藏空白更改
内联并排

Showing 1 changed file with 163 additions and 2 deletions

pgl/sample.py pgl/sample.py +163 -2

未找到文件。
--- a/pgl/sample.py
+++ b/pgl/sample.py
@@ -24,10 +24,29 @@ from pgl import graph_kernel

 __all__ = [
    'graphsage_sample', 'node2vec_sample', 'deepwalk_sample',
-    'metapath_randomwalk'
+    'metapath_randomwalk', 'pinsage_sample'
 ]


+def traverse(item):
+    """traverse the list or numpy"""
+    if isinstance(item, list) or isinstance(item, np.ndarray):
+        for i in iter(item):
+            for j in traverse(i):
+                yield j
+    else:
+        yield item
+
+
+def flat_node_and_edge(nodes, eids, weights=None):
+    """flatten the sub-lists to one list"""
+    nodes = list(set(traverse(nodes)))
+    eids = list(traverse(eids))
+    if weights is not None:
+        weights = list(traverse(weights))
+    return nodes, eids, weights
+
+
 def edge_hash(src, dst):
    """edge_hash
    """
@@ -88,7 +107,6 @@ def graphsage_sample(graph, nodes, samples, ignore_edges=[]):
        start_nodes = list(nodes_set - last_nodes_set)
        layer_nodes = [nodes] + layer_nodes
        layer_eids = [eids] + layer_eids
-        log.debug("flat time: %s" % (time.time() - start))
        start = time.time()
        # Find new nodes

@@ -317,3 +335,146 @@ def metapath_randomwalk(graph,
        cur_nodes = np.array(nxt_cur_nodes)

    return walk
+
+
+def random_walk_with_start_prob(graph, nodes, max_depth, proba=0.5):
+    """Implement of random walk with the probability of returning the origin node.
+
+    This function get random walks path for given nodes and depth.
+
+    Args:
+        nodes: Walk starting from nodes
+        max_depth: Max walking depth
+        proba: the proba to return the origin node
+
+    Return:
+        A list of walks.
+    """
+    walk = []
+    # init
+    for node in nodes:
+        walk.append([node])
+
+    walk_ids = np.arange(0, len(nodes))
+    cur_nodes = np.array(nodes)
+    nodes = np.array(nodes)
+    for l in range(max_depth):
+        # select the walks not end
+        if l >= 1:
+            return_proba = np.random.rand(cur_nodes.shape[0])
+            proba_mask = (return_proba < proba)
+            cur_nodes[proba_mask] = nodes[proba_mask]
+        outdegree = graph.outdegree(cur_nodes)
+        mask = (outdegree != 0)
+        if np.any(mask):
+            cur_walk_ids = walk_ids[mask]
+            outdegree = outdegree[mask]
+        else:
+            # stop when all nodes have no successor, wait start next loop to get precesssor
+            continue
+        succ = graph.successor(cur_nodes[mask])
+        sample_index = np.floor(
+            np.random.rand(outdegree.shape[0]) * outdegree).astype("int64")
+
+        nxt_cur_nodes = cur_nodes
+        for s, ind, walk_id in zip(succ, sample_index, cur_walk_ids):
+            walk[walk_id].append(s[ind])
+            nxt_cur_nodes[walk_id] = s[ind]
+        cur_nodes = np.array(nxt_cur_nodes)
+    return walk
+
+
+def pinsage_sample(graph,
+                   nodes,
+                   samples,
+                   top_k=10,
+                   proba=0.5,
+                   norm_bais=1.0,
+                   ignore_edges=set()):
+    """Implement of graphsage sample.
+    
+    Reference paper: .
+
+    Args:
+        graph: A pgl graph instance
+        nodes: Sample starting from nodes
+        samples: A list, number of neighbors in each layer
+        top_k: select the top_k visit count nodes to construct the edges 
+        proba: the probability to return the origin node 
+        norm_bais: the normlization for the visit count
+        ignore_edges: list of edge(src, dst) will be ignored.
+    
+    Return:
+        A list of subgraphs
+    """
+    start = time.time()
+    num_layers = len(samples)
+    start_nodes = nodes
+    edges, weights = [], []
+    layer_nodes, layer_edges, layer_weights = [], [], []
+    ignore_edge_set = set([edge_hash(src, dst) for src, dst in ignore_edges])
+
+    for layer_idx in reversed(range(num_layers)):
+        if len(start_nodes) == 0:
+            layer_nodes = [nodes] + layer_nodes
+            layer_edges = [edges] + layer_edges
+            layer_edges_weight = [weights] + layer_weights
+            continue
+        walks = random_walk_with_start_prob(
+            graph, start_nodes, samples[layer_idx], proba=proba)
+        walks = [walk[1:] for walk in walks]
+        pred_edges = []
+        pred_weights = []
+        pred_nodes = []
+        for node, walk in zip(start_nodes, walks):
+            walk_nodes = []
+            walk_weights = []
+            count_sum = 0
+
+            for random_walk_node in walk:
+                if len(ignore_edge_set) > 0 and random_walk_node != node and \
+                    edge_hash(random_walk_node, node) in ignore_edge_set:
+                    continue
+                walk_nodes.append(random_walk_node)
+            unique, counts = np.unique(walk_nodes, return_counts=True)
+            frequencies = np.asarray((unique, counts)).T
+            frequencies = frequencies[np.argsort(frequencies[:, 1])]
+            frequencies = frequencies[-1 * top_k:, :]
+            for random_walk_node, random_count in zip(
+                    frequencies[:, 0].tolist(), frequencies[:, 1].tolist()):
+                pred_nodes.append(random_walk_node)
+                pred_edges.append((random_walk_node, node))
+                walk_weights.append(random_count)
+                count_sum += random_count
+            count_sum += len(walk_weights) * norm_bais
+            walk_weights = (np.array(walk_weights) + norm_bais) / (count_sum)
+            pred_weights.extend(walk_weights.tolist())
+        last_node_set = set(nodes)
+        nodes, edges, weights = flat_node_and_edge([nodes, pred_nodes], \
+            [edges, pred_edges], [weights, pred_weights])
+
+        layer_edges = [edges] + layer_edges
+        layer_weights = [weights] + layer_weights
+        layer_nodes = [nodes] + layer_nodes
+
+        start_nodes = list(set(nodes) - last_node_set)
+        start = time.time()
+
+    feed_dict = {}
+
+    subgraphs = []
+
+    for i in range(num_layers):
+        edge_feat_dict = {
+            "weight": np.array(
+                layer_weights[i], dtype='float32')
+        }
+        subgraphs.append(
+            graph.subgraph(
+                nodes=layer_nodes[0],
+                edges=layer_edges[i],
+                edge_feats=edge_feat_dict))
+        subgraphs[i].node_feat["index"] = np.array(
+            layer_nodes[0], dtype="int64")
+
+    return subgraphs