reduce memory of graph in multiprocess sampling

a580ad0c · liweibin · cfd11f59 · a580ad0c · a580ad0c
隐藏空白更改
内联并排

Showing 2 changed files with 45 additions and 62 deletions

pgl/graph.py pgl/graph.py +26 -30

pgl/graph_kernel.pyx pgl/graph_kernel.pyx +19 -32

未找到文件。
--- a/pgl/graph.py
+++ b/pgl/graph.py
@@ -43,8 +43,8 @@ class EdgeIndex(object):
    """

    def __init__(self, u, v, num_nodes):
-        self._v, self._eid, self._degree, self._sorted_u,\
-                self._sorted_v, self._sorted_eid = graph_kernel.build_index(u, v, num_nodes)
+        self._degree, self._sorted_v, self._sorted_u, \
+             self._sorted_eid, self._indptr = graph_kernel.build_index(u, v, num_nodes)

    @property
    def degree(self):
@@ -52,17 +52,25 @@ class EdgeIndex(object):
        """
        return self._degree

-    @property
-    def v(self):
-        """Return the compressed v.
+    def view_v(self, u=None):
+        """Return the compressed v for given u.
        """
-        return self._v
+        if u is None:
+            return np.split(self._sorted_v, self._indptr[1:])
+        else:
+            u = np.array(u, dtype="int64")
+            return graph_kernel.slice_by_index(
+                self._sorted_v, self._indptr, index=u)

-    @property
-    def eid(self):
-        """Return the edge id.
+    def view_eid(self, u=None):
+        """Return the compressed edge id for given u.
        """
-        return self._eid
+        if u is None:
+            return np.split(self._sorted_eid, self._indptr[1:])
+        else:
+            u = np.array(u, dtype="int64")
+            return graph_kernel.slice_by_index(
+                self._sorted_eid, self._indptr, index=u)

    def triples(self):
        """Return the sorted (u, v, eid) tuples.
@@ -287,17 +295,11 @@ class Graph(object):
                       []]

        """
-        if nodes is None:
-            if return_eids:
-                return self.adj_src_index.v, self.adj_src_index.eid
-            else:
-                return self.adj_src_index.v
+        if return_eids:
+            return self.adj_src_index.view_v(
+                nodes), self.adj_src_index.view_eid(nodes)
        else:
-            if return_eids:
-                return self.adj_src_index.v[nodes], self.adj_src_index.eid[
-                    nodes]
-            else:
-                return self.adj_src_index.v[nodes]
+            return self.adj_src_index.view_v(nodes)

    def sample_successor(self,
                         nodes,
@@ -385,17 +387,11 @@ class Graph(object):
                       [2]]

        """
-        if nodes is None:
-            if return_eids:
-                return self.adj_dst_index.v, self.adj_dst_index.eid
-            else:
-                return self.adj_dst_index.v
+        if return_eids:
+            return self.adj_dst_index.view_v(
+                nodes), self.adj_dst_index.view_eid(nodes)
        else:
-            if return_eids:
-                return self.adj_dst_index.v[nodes], self.adj_dst_index.eid[
-                    nodes]
-            else:
-                return self.adj_dst_index.v[nodes]
+            return self.adj_dst_index.view_v(nodes)

    def sample_predecessor(self,
                           nodes,

--- a/pgl/graph_kernel.pyx
+++ b/pgl/graph_kernel.pyx
@@ -53,14 +53,21 @@ def build_index(np.ndarray[np.int64_t, ndim=1] u,
            _tmp_eid[indptr[u[i]] + count[u[i]]] = i
            _tmp_u[indptr[u[i]] + count[u[i]]] = u[i]
            count[u[i]] += 1
+    return degree, _tmp_v, _tmp_u, _tmp_eid, indptr

-    cdef list output_eid = []
-    cdef list output_v = []
-    for i in xrange(n_size):
-        output_eid.append(_tmp_eid[indptr[i]:indptr[i+1]])
-        output_v.append(_tmp_v[indptr[i]:indptr[i+1]])
-    return np.array(output_v), np.array(output_eid), degree, _tmp_u, _tmp_v, _tmp_eid
-
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def slice_by_index(np.ndarray[np.int64_t, ndim=1] u,
+    np.ndarray[np.int64_t, ndim=1] indptr,
+    np.ndarray[np.int64_t, ndim=1] index):
+    """Return the segment ``u[indptr[j]:indptr[j + 1]]`` for every j in index.
+
+    Args:
+        u: Flat int64 array holding all segments back to back.
+        indptr: int64 offsets into ``u``; segment ``j`` spans
+            ``indptr[j]`` (inclusive) to ``indptr[j + 1]`` (exclusive).
+        index: int64 array of segment ids to extract.
+
+    Return:
+        A 1-D object array of length ``len(index)`` whose i-th element is
+        the slice of ``u`` belonging to segment ``index[i]``.
+
+    Raises:
+        IndexError: If an id in ``index`` does not name a segment of
+            ``indptr``.
+    """
+    cdef long long i
+    cdef long long j
+    cdef long long h = len(index)
+    cdef long long n_seg = len(indptr) - 1
+    # Fill a pre-allocated object array instead of calling np.array() on a
+    # list of slices: np.array() would stack equal-length slices into a 2-D
+    # array and, on recent NumPy, reject ragged slices outright. The object
+    # array always has one entry per requested id.
+    cdef np.ndarray output = np.empty(h, dtype=object)
+    for i in xrange(h):
+        j = index[i]
+        # Bounds checking and wraparound are disabled for this function, so
+        # an invalid id would otherwise read indptr out of bounds.
+        if j < 0 or j >= n_seg:
+            raise IndexError(
+                "index %d is out of range for %d segments" % (j, n_seg))
+        output[i] = u[indptr[j]:indptr[j + 1]]
+    return output

 @cython.boundscheck(False)
 @cython.wraparound(False)
@@ -253,22 +260,10 @@ def sample_subset_with_eid(list nids, list eids, long long maxdegree, shuffle=Fa

 @cython.boundscheck(False)
 @cython.wraparound(False)
-def skip_gram_gen_pair(vector[long long] walk_path, long win_size=5):
-    """Return node paris generated by skip-gram algorithm.
-
-    This function will auto remove the pair which src node is the same 
-    as dst node.
-
-    Args:
-        walk_path: List of nodes as a walk path.
-        win_size: the windows size used in skip-gram.
-
-    Return:
-        A tuple of (src node list, dst node list).
-    """
+def skip_gram_gen_pair(vector[long long] walk, long win_size=5):
    cdef vector[long long] src
    cdef vector[long long] dst
-    cdef long long l = len(walk_path)
+    cdef long long l = len(walk)
    cdef long long real_win_size, left, right, i
    cdef np.ndarray[np.int64_t, ndim=1] rnd = np.random.randint(1,  win_size+1,
                                    dtype=np.int64, size=l)
@@ -282,23 +277,15 @@ def skip_gram_gen_pair(vector[long long] walk_path, long win_size=5):
            if right >= l:
                right = l - 1
            for j in xrange(left, right+1):
-                if walk_path[i] == walk_path[j]:
+                if walk[i] == walk[j]:
                    continue
-                src.push_back(walk_path[i])
-                dst.push_back(walk_path[j])
+                src.push_back(walk[i])
+                dst.push_back(walk[j])
    return src, dst

 @cython.boundscheck(False)
 @cython.wraparound(False)
 def alias_sample_build_table(np.ndarray[np.float64_t, ndim=1] probs):
-    """Return the alias table and event table for alias sampling.
-
-    Args:
-        porobs: A list of float numbers as the probability.
-
-    Return:
-        A tuple of (alias table, event table).
-    """
    cdef long long l = len(probs)
    cdef np.ndarray[np.float64_t, ndim=1] alias = probs * l
    cdef np.ndarray[np.int64_t, ndim=1] events = np.zeros(l, dtype=np.int64)