Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
愤怒的狗
deepwalk
提交
7ffd806e
D
deepwalk
项目概览
愤怒的狗
/
deepwalk
与 Fork 源项目一致
从无法访问的项目Fork
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
deepwalk
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
提交
7ffd806e
编写于
9月 09, 2014
作者:
R
Rami Al-Rfou
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Adding parallel support to counting vertex frequency
上级
0a5ce370
变更
3
隐藏空白更改
内联
并排
Showing
3 changed files
with
49 additions
and
6 deletions
+49
-6
deepwalk/__main__.py
deepwalk/__main__.py
+20
-3
deepwalk/walks.py
deepwalk/walks.py
+28
-3
requirements.txt
requirements.txt
+1
-0
未找到文件。
deepwalk/__main__.py
浏览文件 @
7ffd806e
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import
os
import
sys
import
random
from
io
import
open
...
...
@@ -16,6 +17,13 @@ from skipgram import Skipgram
from
six
import
text_type
as
unicode
from
six
import
iteritems
from
six.moves
import
range
import
psutil
from
multiprocessing
import
cpu_count
p
=
psutil
.
Process
(
os
.
getpid
())
p
.
set_cpu_affinity
(
list
(
range
(
cpu_count
())))
logger
=
logging
.
getLogger
(
__name__
)
LOGFORMAT
=
"%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s"
...
...
@@ -68,11 +76,15 @@ def process(args):
path_length
=
args
.
walk_length
,
alpha
=
0
,
rand
=
random
.
Random
(
args
.
seed
),
num_workers
=
args
.
workers
)
# use degree distribution for frequency in tree
vertex_frequency
=
G
.
degree
(
nodes
=
G
.
iterkeys
())
print
(
"Counting vertex frequency..."
)
if
not
args
.
vertex_freq_degree
:
vertex_counts
=
serialized_walks
.
count_textfiles
(
walk_files
,
args
.
workers
)
else
:
# use degree distribution for frequency in tree
vertex_counts
=
G
.
degree
(
nodes
=
G
.
iterkeys
())
print
(
"Training..."
)
model
=
Skipgram
(
sentences
=
serialized_walks
.
combine_files_iter
(
walk_files
),
vocabulary_counts
=
vertex_
frequency
,
model
=
Skipgram
(
sentences
=
serialized_walks
.
combine_files_iter
(
walk_files
),
vocabulary_counts
=
vertex_
counts
,
size
=
args
.
representation_size
,
window
=
args
.
window_size
,
min_count
=
0
,
workers
=
args
.
workers
)
...
...
@@ -117,6 +129,11 @@ def main():
parser
.
add_argument
(
'--undirected'
,
default
=
True
,
type
=
bool
,
help
=
'Treat graph as undirected.'
)
parser
.
add_argument
(
'--vertex-freq-degree'
,
default
=
False
,
action
=
'store_true'
,
help
=
'Use vertex degree to estimate the frequency of nodes '
'in the random walks. This option is faster than '
'calculating the vocabulary.'
)
parser
.
add_argument
(
'--walk-length'
,
default
=
40
,
type
=
int
,
help
=
'Length of the random walk started at each node'
)
...
...
deepwalk/walks.py
浏览文件 @
7ffd806e
...
...
@@ -2,10 +2,12 @@ import logging
from
io
import
open
from
os
import
path
from
time
import
time
from
itertools
import
izip
from
multiprocessing
import
cpu_count
import
random
from
concurrent.futures
import
ProcessPoolExecutor
from
collections
import
Counter
from
six.moves
import
zip
from
deepwalk
import
graph
...
...
@@ -16,6 +18,29 @@ __current_graph = None
# speed up the string encoding
__vertex2str
=
None
def count_words(file):
    """Count word frequencies in a single text file.

    Note:
      This is a helper function for parallel execution of
      `Vocabulary.from_text`; each worker tallies one file.
    """
    word_counts = Counter()
    with open(file, 'r') as handle:
        for line in handle:
            # Whitespace-split each stripped line and fold the tokens in.
            word_counts.update(line.strip().split())
    return word_counts
def count_textfiles(files, workers=1):
    """Aggregate word frequencies across many text files in parallel.

    Fans `count_words` out over a process pool of `workers` processes and
    merges each per-file Counter into a single combined Counter.
    """
    combined = Counter()
    with ProcessPoolExecutor(max_workers=workers) as pool:
        # Results arrive in input order; merging is order-insensitive anyway.
        for per_file in pool.map(count_words, files):
            combined.update(per_file)
    return combined
def
count_lines
(
f
):
if
path
.
isfile
(
f
):
num_lines
=
sum
(
1
for
line
in
open
(
f
))
...
...
@@ -52,7 +77,7 @@ def write_walks_to_disk(G, filebase, num_paths, path_length, alpha=0, rand=rando
for
x
in
graph
.
grouper
(
int
(
num_paths
/
num_workers
)
+
1
,
range
(
1
,
num_paths
+
1
))]
with
ProcessPoolExecutor
(
max_workers
=
num_workers
)
as
executor
:
for
size
,
file_
,
ppw
in
i
zip
(
executor
.
map
(
count_lines
,
files_list
),
files_list
,
paths_per_worker
):
for
size
,
file_
,
ppw
in
zip
(
executor
.
map
(
count_lines
,
files_list
),
files_list
,
paths_per_worker
):
if
always_rebuild
or
size
!=
(
ppw
*
expected_size
):
args_list
.
append
((
ppw
,
path_length
,
alpha
,
random
.
Random
(
rand
.
randint
(
0
,
2
**
31
)),
file_
))
else
:
...
...
@@ -68,4 +93,4 @@ def combine_files_iter(file_list):
for
file
in
file_list
:
with
open
(
file
,
'r'
)
as
f
:
for
line
in
f
:
yield
line
.
split
()
\ No newline at end of file
yield
line
.
split
()
requirements.txt
浏览文件 @
7ffd806e
...
...
@@ -5,3 +5,4 @@ futures>=2.1.6
six
>=1.7.3
gensim
>=0.10.0
scipy
>=0.7.0
psutil
>=2.1.1
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录