Commit 8e8c06e8 authored by Haochen Chen, committed by Bryan Perozzi

compatibility with newest gensim and sklearn (#46)

Updates deepwalk to work with the latest versions of its dependencies, gensim and sklearn.

* Compatibility with newest gensim and sklearn.
* Modify requirements.txt.
Parent 96af38c3
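For orientation, the sketch below (not repo code) shows the gensim API shift this commit tracks: from gensim 1.0 onward the trained vectors live on model.wv as a KeyedVectors object, so saving goes through model.wv.save_word2vec_format and loading through KeyedVectors.load_word2vec_format. The toy walks, dimensions, and file name are illustrative, and the parameter names assume gensim 1.x to 3.x as used in this diff (gensim 4.x later renamed size to vector_size).

```python
from gensim.models import Word2Vec, KeyedVectors

# Toy corpus: a few walks over string node IDs (illustrative only).
walks = [["0", "1", "2"], ["2", "1", "0"], ["1", "2", "0"]]

model = Word2Vec(walks, size=8, window=2, min_count=0, sg=1, hs=1, workers=1)

# gensim 0.x:   model.save_word2vec_format(...) / Word2Vec.load_word2vec_format(...)
# gensim >=1.0: the embeddings live on model.wv, a KeyedVectors instance.
model.wv.save_word2vec_format("toy.embeddings")
vectors = KeyedVectors.load_word2vec_format("toy.embeddings", binary=False)

print(vectors["0"])  # the 8-dimensional vector learned for node "0"
```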
@@ -10,8 +10,8 @@ from collections import Counter
 from concurrent.futures import ProcessPoolExecutor
 import logging
-from deepwalk import graph
-from deepwalk import walks as serialized_walks
+import graph
+import walks as serialized_walks
 from gensim.models import Word2Vec
 from skipgram import Skipgram
@@ -72,7 +72,7 @@ def process(args):
     walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                         path_length=args.walk_length, alpha=0, rand=random.Random(args.seed))
     print("Training...")
-    model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers)
+    model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, sg=1, hs=1, workers=args.workers)
   else:
     print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(data_size, args.max_memory_data_size))
     print("Walking...")
@@ -90,11 +90,12 @@ def process(args):
       vertex_counts = G.degree(nodes=G.iterkeys())

     print("Training...")
-    model = Skipgram(sentences=serialized_walks.combine_files_iter(walk_files), vocabulary_counts=vertex_counts,
+    walks_corpus = serialized_walks.WalksCorpus(walk_files)
+    model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                      size=args.representation_size,
-                     window=args.window_size, min_count=0, workers=args.workers)
+                     window=args.window_size, min_count=0, trim_rule=None, workers=args.workers)

-  model.save_word2vec_format(args.output)
+  model.wv.save_word2vec_format(args.output)

 def main():
@@ -148,7 +148,7 @@ class Graph(defaultdict):
           path.append(path[0])
       else:
         break
-    return path
+    return [str(node) for node in path]

 # TODO add build_walks in here
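The one-line change above makes random_walk emit node IDs as strings. A tiny sketch (not repo code) of why that is enough for both training paths: gensim treats every token in a sentence as a word, and a walk serialized to disk round-trips with a plain split, which is what lets the __vertex2str map in walks.py go away further down.

```python
# Illustrative node IDs only.
path = [3, 17, 42, 17]                    # a walk over integer vertices
sentence = [str(node) for node in path]   # what random_walk now returns

line = " ".join(sentence)                 # what _write_walks_to_disk writes out
assert line == "3 17 42 17"
assert line.split() == sentence           # what a reader gets back per line
```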
@@ -16,47 +16,15 @@ class Skipgram(Word2Vec):

         self.vocabulary_counts = None

-        kwargs["min_count"] = kwargs.get("min_count", 1)
+        kwargs["min_count"] = kwargs.get("min_count", 0)
         kwargs["workers"] = kwargs.get("workers", cpu_count())
         kwargs["size"] = kwargs.get("size", 128)
         kwargs["sentences"] = kwargs.get("sentences", None)
         kwaargs["window"] = kwargs.get("window", 10),
         kwargs["sg"] = 1
         kwargs["hs"] = 1

         if vocabulary_counts != None:
             self.vocabulary_counts = vocabulary_counts

         super(Skipgram, self).__init__(**kwargs)

-    def build_vocab(self, corpus):
-        """
-        Build vocabulary from a sequence of sentences or from a frequency dictionary, if one was provided.
-        """
-        if self.vocabulary_counts != None:
-            logger.debug("building vocabulary from provided frequency map")
-            vocab = self.vocabulary_counts
-        else:
-            logger.debug("default vocabulary building")
-            super(Skipgram, self).build_vocab(corpus)
-            return
-
-        # assign a unique index to each word
-        self.vocab, self.index2word = {}, []
-        for word, count in vocab.iteritems():
-            v = Vocab()
-            v.count = count
-            if v.count >= self.min_count:
-                v.index = len(self.vocab)
-                self.index2word.append(word)
-                self.vocab[word] = v
-        logger.debug("total %i word types after removing those with count<%s" % (len(self.vocab), self.min_count))
-
-        if self.hs:
-            # add info about each word's Huffman encoding
-            self.create_binary_tree()
-        if self.negative:
-            # build the table for drawing random words (for negative sampling)
-            self.make_table()
-        # precalculate downsampling thresholds
-        self.precalc_sampling()
-        self.reset_weights()
\ No newline at end of file
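The whole build_vocab override is dropped because it reached into gensim 0.x internals (the Vocab class, make_table, precalc_sampling) that newer gensim no longer exposes; since gensim 1.0 the base Word2Vec builds and prunes its vocabulary itself when sentences are passed to the constructor, which is why the remaining constructor just forwards kwargs (including trim_rule) to the parent class. A hedged sketch of that behavior with plain gensim and toy walks, pre-4.0 parameter names assumed:

```python
from gensim.models import Word2Vec

# Toy walk corpus of string node IDs (illustrative only).
walks = [["0", "1", "2"], ["2", "1", "0"], ["1", "0", "1"]]

# Passing sentences to the constructor makes gensim scan them, build the
# vocabulary (honoring min_count and trim_rule), and train, so no hand-rolled
# build_vocab() is needed any more.
model = Word2Vec(walks, size=32, window=5, min_count=0, sg=1, hs=1,
                 trim_rule=None, workers=1)

assert len(model.wv.vocab) == 3   # the node "words" '0', '1', '2'
```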
@@ -9,7 +9,7 @@ from collections import Counter
 from six.moves import zip

-from deepwalk import graph
+import graph

 logger = logging.getLogger("deepwalk")
@@ -55,16 +55,14 @@ def _write_walks_to_disk(args):
   with open(f, 'w') as fout:
     for walk in graph.build_deepwalk_corpus_iter(G=G, num_paths=num_paths, path_length=path_length,
                                                  alpha=alpha, rand=rand):
-      fout.write(u"{}\n".format(u" ".join(__vertex2str[v] for v in walk)))
+      fout.write(u"{}\n".format(u" ".join(v for v in walk)))
   logger.debug("Generated new file {}, it took {} seconds".format(f, time() - t_0))
   return f

 def write_walks_to_disk(G, filebase, num_paths, path_length, alpha=0, rand=random.Random(0), num_workers=cpu_count(),
                         always_rebuild=True):
   global __current_graph
-  global __vertex2str
   __current_graph = G
-  __vertex2str = {v:str(v) for v in G.nodes()}
   files_list = ["{}.{}".format(filebase, str(x)) for x in xrange(num_paths)]
   expected_size = len(G)
   args_list = []
@@ -89,6 +87,15 @@ def write_walks_to_disk(G, filebase, num_paths, path_length, alpha=0, rand=random.Random(0), num_workers=cpu_count(),
   return files

+class WalksCorpus(object):
+  def __init__(self, file_list):
+    self.file_list = file_list
+  def __iter__(self):
+    for file in self.file_list:
+      with open(file, 'r') as f:
+        for line in f:
+          yield line.split()
+
 def combine_files_iter(file_list):
   for file in file_list:
     with open(file, 'r') as f:
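WalksCorpus replaces combine_files_iter as the sentences source because gensim consumes the corpus more than once, one pass to build the vocabulary and one per training epoch, so the walks must come from a restartable iterable rather than a one-shot generator. A self-contained sketch (temporary file and walk contents are illustrative; the class is restated here so the snippet runs on its own):

```python
import os
import tempfile

class WalksCorpus(object):               # same shape as the class added in walks.py
    def __init__(self, file_list):
        self.file_list = file_list
    def __iter__(self):
        for name in self.file_list:
            with open(name) as f:
                for line in f:
                    yield line.split()

def one_shot(file_list):                 # the combine_files_iter style: a plain generator
    for name in file_list:
        with open(name) as f:
            for line in f:
                yield line.split()

path = os.path.join(tempfile.mkdtemp(), "walks.0")
with open(path, "w") as f:
    f.write("0 1 2\n2 1 0\n")

corpus = WalksCorpus([path])
assert sum(1 for _ in corpus) == 2       # vocabulary scan sees both walks
assert sum(1 for _ in corpus) == 2       # training pass sees them again

gen = one_shot([path])
assert sum(1 for _ in gen) == 2
assert sum(1 for _ in gen) == 0          # exhausted: a second pass sees nothing
```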
@@ -4,7 +4,6 @@
 __author__ = "Bryan Perozzi"

 import numpy

 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.linear_model import LogisticRegression
@@ -12,9 +11,10 @@ from itertools import izip
 from sklearn.metrics import f1_score
 from scipy.io import loadmat
 from sklearn.utils import shuffle as skshuffle
+from sklearn.preprocessing import MultiLabelBinarizer
 from collections import defaultdict

-from gensim.models import Word2Vec
+from gensim.models import Word2Vec, KeyedVectors

 class TopKRanker(OneVsRestClassifier):
   def predict(self, X, top_k_list):
@@ -39,14 +39,15 @@ embeddings_file = "blogcatalog.embeddings"
 matfile = "blogcatalog.mat"

 # 1. Load Embeddings
-model = Word2Vec.load_word2vec_format(embeddings_file, binary=False,
-                                      norm_only=False)
+model = KeyedVectors.load_word2vec_format(embeddings_file, binary=False)

 # 2. Load labels
 mat = loadmat(matfile)
 A = mat['network']
 graph = sparse2graph(A)
 labels_matrix = mat['group']
+labels_count = labels_matrix.shape[1]
+mlb = MultiLabelBinarizer(xrange(labels_count))

 # Map nodes to their features (note: assumes nodes are labeled as integers 1:N)
 features_matrix = numpy.asarray([model[str(node)] for node in range(len(graph))])
@@ -85,23 +86,23 @@ for train_percent in training_percents:
     X_test = X[training_size:, :]
     y_test_ = y[training_size:]

-    y_test = [[] for x in xrange(y_test_.shape[0])]
+    y_test = [[] for _ in xrange(y_test_.shape[0])]

     cy = y_test_.tocoo()
     for i, j in izip(cy.row, cy.col):
         y_test[i].append(j)

     clf = TopKRanker(LogisticRegression())
-    clf.fit(X_train, y_train)
+    clf.fit(X_train, y_train_)

     # find out how many labels should be predicted
     top_k_list = [len(l) for l in y_test]
     preds = clf.predict(X_test, top_k_list)

     results = {}
-    averages = ["micro", "macro", "samples", "weighted"]
+    averages = ["micro", "macro"]
     for average in averages:
-        results[average] = f1_score(y_test, preds, average=average)
+        results[average] = f1_score(mlb.fit_transform(y_test), mlb.fit_transform(preds), average=average)

     all_results[train_percent].append(results)
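The scoring changes track sklearn as well: newer releases dropped the old sequence-of-label-lists multilabel format, so f1_score now expects binary indicator matrices, which is what the MultiLabelBinarizer above provides, and the "samples"/"weighted" averages were removed from the report. A small self-contained sketch with made-up labels:

```python
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer

labels_count = 4
mlb = MultiLabelBinarizer(classes=list(range(labels_count)))

# Per-node lists of label indices (illustrative only).
y_true = [[0, 2], [1], [2, 3]]
y_pred = [[0], [1], [2]]

# Old sklearn accepted the ragged lists directly; now both sides are binarized first.
for average in ["micro", "macro"]:
    score = f1_score(mlb.fit_transform(y_true), mlb.fit_transform(y_pred), average=average)
    print("{}: {:.3f}".format(average, score))
```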
@@ -3,6 +3,6 @@ Cython>=0.20.2
 argparse>=1.2.1
 futures>=2.1.6
 six>=1.7.3
-gensim==0.10.2
-scipy>=0.7.0
+gensim>=1.0.0
+scipy>=0.15.0
 psutil>=2.1.1