Commit 8e8c06e8 authored by Haochen Chen, committed by Bryan Perozzi

compatibility with newest gensim and sklearn (#46)

Updates deepwalk to work with the latest versions of its dependencies, gensim and sklearn.

* Compatibility with newest gensim and sklearn.
* Modify requirements.txt.
Parent 96af38c3
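For orientation, the sketch below (not repo code) shows the gensim API shift this commit tracks: from gensim 1.0 onward the trained vectors live on model.wv as a KeyedVectors object, so saving goes through model.wv.save_word2vec_format and loading through KeyedVectors.load_word2vec_format. The toy walks, dimensions, and file name are illustrative, and the parameter names assume gensim 1.x to 3.x as used in this diff (gensim 4.x later renamed size to vector_size).

```python
from gensim.models import Word2Vec, KeyedVectors

# Toy corpus: a few walks over string node IDs (illustrative only).
walks = [["0", "1", "2"], ["2", "1", "0"], ["1", "2", "0"]]

model = Word2Vec(walks, size=8, window=2, min_count=0, sg=1, hs=1, workers=1)

# gensim 0.x:   model.save_word2vec_format(...) / Word2Vec.load_word2vec_format(...)
# gensim >=1.0: the embeddings live on model.wv, a KeyedVectors instance.
model.wv.save_word2vec_format("toy.embeddings")
vectors = KeyedVectors.load_word2vec_format("toy.embeddings", binary=False)

print(vectors["0"])  # the 8-dimensional vector learned for node "0"
```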
@@ -10,8 +10,8 @@ from collections import Counter
 from concurrent.futures import ProcessPoolExecutor
 import logging
-from deepwalk import graph
-from deepwalk import walks as serialized_walks
+import graph
+import walks as serialized_walks
 from gensim.models import Word2Vec
 from skipgram import Skipgram
@@ -72,7 +72,7 @@ def process(args):
     walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                         path_length=args.walk_length, alpha=0, rand=random.Random(args.seed))
     print("Training...")
-    model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers)
+    model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, sg=1, hs=1, workers=args.workers)
   else:
     print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(data_size, args.max_memory_data_size))
     print("Walking...")
@@ -90,11 +90,12 @@ def process(args):
       vertex_counts = G.degree(nodes=G.iterkeys())

     print("Training...")
-    model = Skipgram(sentences=serialized_walks.combine_files_iter(walk_files), vocabulary_counts=vertex_counts,
+    walks_corpus = serialized_walks.WalksCorpus(walk_files)
+    model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                      size=args.representation_size,
-                     window=args.window_size, min_count=0, workers=args.workers)
+                     window=args.window_size, min_count=0, trim_rule=None, workers=args.workers)

-  model.save_word2vec_format(args.output)
+  model.wv.save_word2vec_format(args.output)

 def main():
@@ -148,7 +148,7 @@ class Graph(defaultdict):
           path.append(path[0])
       else:
         break
-    return path
+    return [str(node) for node in path]

 # TODO add build_walks in here
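The one-line change above makes random_walk emit node IDs as strings. A tiny sketch (not repo code) of why that is enough for both training paths: gensim treats every token in a sentence as a word, and a walk serialized to disk round-trips with a plain split, which is what lets the __vertex2str map in walks.py go away further down.

```python
# Illustrative node IDs only.
path = [3, 17, 42, 17]                    # a walk over integer vertices
sentence = [str(node) for node in path]   # what random_walk now returns

line = " ".join(sentence)                 # what _write_walks_to_disk writes out
assert line == "3 17 42 17"
assert line.split() == sentence           # what a reader gets back per line
```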
@@ -16,47 +16,15 @@ class Skipgram(Word2Vec):

         self.vocabulary_counts = None

-        kwargs["min_count"] = kwargs.get("min_count", 1)
+        kwargs["min_count"] = kwargs.get("min_count", 0)
         kwargs["workers"] = kwargs.get("workers", cpu_count())
         kwargs["size"] = kwargs.get("size", 128)
         kwargs["sentences"] = kwargs.get("sentences", None)
         kwaargs["window"] = kwargs.get("window", 10),
         kwargs["sg"] = 1
         kwargs["hs"] = 1

         if vocabulary_counts != None:
             self.vocabulary_counts = vocabulary_counts

         super(Skipgram, self).__init__(**kwargs)

-    def build_vocab(self, corpus):
-        """
-        Build vocabulary from a sequence of sentences or from a frequency dictionary, if one was provided.
-        """
-        if self.vocabulary_counts != None:
-            logger.debug("building vocabulary from provided frequency map")
-            vocab = self.vocabulary_counts
-        else:
-            logger.debug("default vocabulary building")
-            super(Skipgram, self).build_vocab(corpus)
-            return
-
-        # assign a unique index to each word
-        self.vocab, self.index2word = {}, []
-        for word, count in vocab.iteritems():
-            v = Vocab()
-            v.count = count
-            if v.count >= self.min_count:
-                v.index = len(self.vocab)
-                self.index2word.append(word)
-                self.vocab[word] = v
-        logger.debug("total %i word types after removing those with count<%s" % (len(self.vocab), self.min_count))
-
-        if self.hs:
-            # add info about each word's Huffman encoding
-            self.create_binary_tree()
-        if self.negative:
-            # build the table for drawing random words (for negative sampling)
-            self.make_table()
-        # precalculate downsampling thresholds
-        self.precalc_sampling()
-        self.reset_weights()
\ No newline at end of file
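The whole build_vocab override is dropped because it reached into gensim 0.x internals (the Vocab class, make_table, precalc_sampling) that newer gensim no longer exposes; since gensim 1.0 the base Word2Vec builds and prunes its vocabulary itself when sentences are passed to the constructor, which is why the remaining constructor just forwards kwargs (including trim_rule) to the parent class. A hedged sketch of that behavior with plain gensim and toy walks, pre-4.0 parameter names assumed:

```python
from gensim.models import Word2Vec

# Toy walk corpus of string node IDs (illustrative only).
walks = [["0", "1", "2"], ["2", "1", "0"], ["1", "0", "1"]]

# Passing sentences to the constructor makes gensim scan them, build the
# vocabulary (honoring min_count and trim_rule), and train, so no hand-rolled
# build_vocab() is needed any more.
model = Word2Vec(walks, size=32, window=5, min_count=0, sg=1, hs=1,
                 trim_rule=None, workers=1)

assert len(model.wv.vocab) == 3   # the node "words" '0', '1', '2'
```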
@@ -9,7 +9,7 @@ from collections import Counter
 from six.moves import zip

-from deepwalk import graph
+import graph

 logger = logging.getLogger("deepwalk")
@@ -55,16 +55,14 @@ def _write_walks_to_disk(args):
   with open(f, 'w') as fout:
     for walk in graph.build_deepwalk_corpus_iter(G=G, num_paths=num_paths, path_length=path_length,
                                                  alpha=alpha, rand=rand):
-      fout.write(u"{}\n".format(u" ".join(__vertex2str[v] for v in walk)))
+      fout.write(u"{}\n".format(u" ".join(v for v in walk)))
   logger.debug("Generated new file {}, it took {} seconds".format(f, time() - t_0))
   return f

 def write_walks_to_disk(G, filebase, num_paths, path_length, alpha=0, rand=random.Random(0), num_workers=cpu_count(),
                         always_rebuild=True):
   global __current_graph
-  global __vertex2str
   __current_graph = G
-  __vertex2str = {v:str(v) for v in G.nodes()}
   files_list = ["{}.{}".format(filebase, str(x)) for x in xrange(num_paths)]
   expected_size = len(G)
   args_list = []
@@ -89,6 +87,15 @@ def write_walks_to_disk(G, filebase, num_paths, path_length, alpha=0, rand=random.Random(0), num_workers=cpu_count(),
   return files

+class WalksCorpus(object):
+  def __init__(self, file_list):
+    self.file_list = file_list
+  def __iter__(self):
+    for file in self.file_list:
+      with open(file, 'r') as f:
+        for line in f:
+          yield line.split()
+
 def combine_files_iter(file_list):
   for file in file_list:
     with open(file, 'r') as f:
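WalksCorpus replaces combine_files_iter as the sentences source because gensim consumes the corpus more than once, one pass to build the vocabulary and one per training epoch, so the walks must come from a restartable iterable rather than a one-shot generator. A self-contained sketch (temporary file and walk contents are illustrative; the class is restated here so the snippet runs on its own):

```python
import os
import tempfile

class WalksCorpus(object):               # same shape as the class added in walks.py
    def __init__(self, file_list):
        self.file_list = file_list
    def __iter__(self):
        for name in self.file_list:
            with open(name) as f:
                for line in f:
                    yield line.split()

def one_shot(file_list):                 # the combine_files_iter style: a plain generator
    for name in file_list:
        with open(name) as f:
            for line in f:
                yield line.split()

path = os.path.join(tempfile.mkdtemp(), "walks.0")
with open(path, "w") as f:
    f.write("0 1 2\n2 1 0\n")

corpus = WalksCorpus([path])
assert sum(1 for _ in corpus) == 2       # vocabulary scan sees both walks
assert sum(1 for _ in corpus) == 2       # training pass sees them again

gen = one_shot([path])
assert sum(1 for _ in gen) == 2
assert sum(1 for _ in gen) == 0          # exhausted: a second pass sees nothing
```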
@@ -4,7 +4,6 @@
 __author__ = "Bryan Perozzi"

 import numpy

 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.linear_model import LogisticRegression
@@ -12,9 +11,10 @@ from itertools import izip
 from sklearn.metrics import f1_score
 from scipy.io import loadmat
 from sklearn.utils import shuffle as skshuffle
+from sklearn.preprocessing import MultiLabelBinarizer
 from collections import defaultdict

-from gensim.models import Word2Vec
+from gensim.models import Word2Vec, KeyedVectors

 class TopKRanker(OneVsRestClassifier):
   def predict(self, X, top_k_list):
@@ -39,14 +39,15 @@ embeddings_file = "blogcatalog.embeddings"
 matfile = "blogcatalog.mat"

 # 1. Load Embeddings
-model = Word2Vec.load_word2vec_format(embeddings_file, binary=False,
-                                      norm_only=False)
+model = KeyedVectors.load_word2vec_format(embeddings_file, binary=False)

 # 2. Load labels
 mat = loadmat(matfile)
 A = mat['network']
 graph = sparse2graph(A)
 labels_matrix = mat['group']
+labels_count = labels_matrix.shape[1]
+mlb = MultiLabelBinarizer(xrange(labels_count))

 # Map nodes to their features (note: assumes nodes are labeled as integers 1:N)
 features_matrix = numpy.asarray([model[str(node)] for node in range(len(graph))])
@@ -85,23 +86,23 @@ for train_percent in training_percents:
     X_test = X[training_size:, :]
     y_test_ = y[training_size:]

-    y_test = [[] for x in xrange(y_test_.shape[0])]
+    y_test = [[] for _ in xrange(y_test_.shape[0])]

     cy = y_test_.tocoo()
     for i, j in izip(cy.row, cy.col):
         y_test[i].append(j)

     clf = TopKRanker(LogisticRegression())
-    clf.fit(X_train, y_train)
+    clf.fit(X_train, y_train_)

     # find out how many labels should be predicted
     top_k_list = [len(l) for l in y_test]
     preds = clf.predict(X_test, top_k_list)

     results = {}
-    averages = ["micro", "macro", "samples", "weighted"]
+    averages = ["micro", "macro"]
     for average in averages:
-        results[average] = f1_score(y_test, preds, average=average)
+        results[average] = f1_score(mlb.fit_transform(y_test), mlb.fit_transform(preds), average=average)

     all_results[train_percent].append(results)
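The scoring changes track sklearn as well: newer releases dropped the old sequence-of-label-lists multilabel format, so f1_score now expects binary indicator matrices, which is what the MultiLabelBinarizer above provides, and the "samples"/"weighted" averages were removed from the report. A small self-contained sketch with made-up labels:

```python
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer

labels_count = 4
mlb = MultiLabelBinarizer(classes=list(range(labels_count)))

# Per-node lists of label indices (illustrative only).
y_true = [[0, 2], [1], [2, 3]]
y_pred = [[0], [1], [2]]

# Old sklearn accepted the ragged lists directly; now both sides are binarized first.
for average in ["micro", "macro"]:
    score = f1_score(mlb.fit_transform(y_true), mlb.fit_transform(y_pred), average=average)
    print("{}: {:.3f}".format(average, score))
```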
@@ -3,6 +3,6 @@ Cython>=0.20.2
 argparse>=1.2.1
 futures>=2.1.6
 six>=1.7.3
-gensim==0.10.2
-scipy>=0.7.0
+gensim>=1.0.0
+scipy>=0.15.0
 psutil>=2.1.1