Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
幻灰龙
CSDN 统一标签设计
提交
d690c9c5
C
CSDN 统一标签设计
项目概览
幻灰龙
/
CSDN 统一标签设计
与 Fork 源项目一致
Fork自
CSDN 技术社区 / CSDN 统一标签设计
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
C
CSDN 统一标签设计
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
d690c9c5
编写于
4月 20, 2021
作者:
F
feilong
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
添加cnblogs标签数据集
上级
47043122
变更
6
展开全部
隐藏空白更改
内联
并排
Showing
6 changed files
with
31333 additions
and
13075 deletions
+31333
-13075
.gitignore
.gitignore
+2
-1
src/dataset/cnblogs.tag.json
src/dataset/cnblogs.tag.json
+31168
-0
src/dataset/segmentfault.tag.json
src/dataset/segmentfault.tag.json
+1
-13074
src/main.py
src/main.py
+6
-0
src/tag_source/cnblogs.py
src/tag_source/cnblogs.py
+80
-0
src/tag_source/infoq.py
src/tag_source/infoq.py
+76
-0
未找到文件。
.gitignore
浏览文件 @
d690c9c5
__pycache__
\ No newline at end of file
__pycache__
test.html
\ No newline at end of file
src/dataset/cnblogs.tag.json
0 → 100644
浏览文件 @
d690c9c5
此差异已折叠。
点击以展开。
src/dataset/segmentfault.tag.json
浏览文件 @
d690c9c5
此差异已折叠。
点击以展开。
src/main.py
浏览文件 @
d690c9c5
...
...
@@ -2,6 +2,8 @@ import click
import
tag_source.vscode
import
tag_source.stackoverflow
import
tag_source.segmentfault
import
tag_source.infoq
import
tag_source.cnblogs
@
click
.
command
()
@
click
.
option
(
'--source'
)
...
...
@@ -13,6 +15,10 @@ def fetch(source):
tag_source
.
stackoverflow
.
fetch
()
elif
source
==
'sf'
:
tag_source
.
segmentfault
.
fetch
()
elif
source
==
'infoq'
:
tag_source
.
infoq
.
fetch
()
elif
source
==
'cnblogs'
:
tag_source
.
cnblogs
.
fetch
()
if
__name__
==
'__main__'
:
fetch
()
\ No newline at end of file
src/tag_source/cnblogs.py
0 → 100644
浏览文件 @
d690c9c5
import
os
import
json
import
urllib.request
from
scrapy.selector
import
Selector
from
scrapy.http
import
HtmlResponse
import
scrapy
from
scrapy.crawler
import
CrawlerProcess
from
scrapy.settings
import
Settings
class CNBlogTagSpider(scrapy.Spider):
    """Spider that walks the cnblogs Q&A tag list and yields {'name', 'star'} items.

    Pages look like https://q.cnblogs.com/tag/list?pageindex=N; each tag cell
    carries the tag name in an <a> and its usage count in parentheses, e.g. "(123)".
    Items are handed to tag_source.cnblogs.TagPipeline (see custom_settings).
    """

    name = "cnblogs_tags"
    allowed_domains = ["cnblogs.com"]
    start_urls = ['https://q.cnblogs.com/tag/list?pageindex=1']
    custom_settings = {
        'ITEM_PIPELINES': {'tag_source.cnblogs.TagPipeline': 301},
        'LOG_LEVEL': 'INFO',
    }

    def __init__(self):
        # Spider.__init__ must run so scrapy wires up name/start_urls state.
        super().__init__()
        self.page_count = 0      # pages parsed so far
        self.total_pages = 520   # hard stop so the crawl cannot loop forever

    def parse(self, response):
        self.page_count += 1
        tag_div = response.css('.tag-div')
        tags = tag_div.xpath('div/table/tr/td')
        for tag in tags:
            name = tag.xpath('li/a/text()').get()
            star = tag.xpath('li/text()').get()
            # Skip malformed cells instead of crashing on None.
            if name is None or star is None:
                continue
            # Drop the surrounding parentheses: "(123)" -> "123".
            star = star[1:-1]
            yield {'name': name, 'star': star}
        if self.page_count < self.total_pages:
            next_page_list = response.css('#pager>a')
            if len(next_page_list) > 0:
                # The last pager link is "next page".
                next_page = next_page_list[-1].css('::attr(href)').get()
                print('next_page:', next_page)
                # dont_filter: pager URLs repeat, so bypass the dupe filter.
                yield response.follow(next_page, callback=self.parse, dont_filter=True)
class TagPipeline(object):
    """Item pipeline that streams unique tags into dataset/cnblogs.tag.json.

    Output is a JSON array written incrementally: '[' on open, one indented
    object per unique tag name (comma-separated), ']' on close. Duplicate
    names (the pager is crawled with dont_filter) are passed through unwritten.
    """

    def open_spider(self, spider):
        # Explicit utf-8: items are dumped with ensure_ascii=False, so the
        # file must not fall back to a platform-dependent encoding.
        self.file = open('dataset/cnblogs.tag.json', 'w', encoding='utf-8')
        self.file.write('[\n')
        self.count = 0   # items written so far (controls the leading comma)
        self.tags = {}   # tag names already written, for de-duplication

    def close_spider(self, spider):
        self.file.write('\n]')
        self.file.close()

    def process_item(self, item, spider):
        # Scrapy pipelines must return the item so later pipelines receive it.
        if self.tags.get(item['name']) is not None:
            return item
        self.tags[item['name']] = True
        words = []
        if self.count > 0:
            words.append(',\n')
        words.append('    ')
        words.append(json.dumps(item, ensure_ascii=False).strip())
        self.file.write(''.join(words))
        self.count += 1
        return item
def fetch():
    """Run the cnblogs tag spider in a blocking CrawlerProcess.

    Starts the Twisted reactor and returns only when the crawl finishes.
    """
    # NOTE(review): the original built an unused Settings() here; removed.
    process = CrawlerProcess()
    process.crawl(CNBlogTagSpider)
    process.start()
\ No newline at end of file
src/tag_source/infoq.py
0 → 100644
浏览文件 @
d690c9c5
import
os
import
json
import
urllib.request
from
scrapy.selector
import
Selector
from
scrapy.http
import
HtmlResponse
import
scrapy
from
scrapy.crawler
import
CrawlerProcess
from
scrapy.settings
import
Settings
class InfoQSpider(scrapy.Spider):
    """Spider that scrapes topics from infoq.cn and yields {'name', 'desc', 'star'}.

    Each '.navigation-list' entry carries the topic title (h2/a), a short
    description (p), and a follower count (div/strong). Items go to
    tag_source.infoq.TagPipeline (see custom_settings).
    """

    name = "infoq_tags"
    allowed_domains = ["infoq.cn"]
    start_urls = ['https://www.infoq.cn/topics']
    custom_settings = {
        'ITEM_PIPELINES': {'tag_source.infoq.TagPipeline': 301},
        'LOG_LEVEL': 'INFO',
    }

    def __init__(self):
        # Spider.__init__ must run so scrapy wires up name/start_urls state.
        super().__init__()
        self.page_count = 0      # pages parsed so far
        # NOTE(review): never consulted in parse() (unlike the cnblogs spider);
        # kept for parity but effectively unused.
        self.total_pages = 654

    def parse(self, response):
        # Removed leftover debug prints of the raw body and selector list.
        self.page_count += 1
        tags = response.css('.navigation-list')
        for tag in tags:
            name = tag.xpath('h2/a/text()').get()
            desc = tag.xpath('p/text()').get()
            star = tag.xpath('div/strong/text()').get()
            yield {'name': name, 'desc': desc, 'star': star}
        next_page_list = response.css('.next')
        if len(next_page_list) > 0:
            # The last '.next' element holds the link to the following page.
            next_page = next_page_list[-1].css('a::attr(href)').get()
            print('next_page:', next_page)
            # dont_filter: pagination URLs repeat, so bypass the dupe filter.
            yield response.follow(next_page, callback=self.parse, dont_filter=True)
class TagPipeline(object):
    """Item pipeline that streams unique infoq topics into dataset/infoq.tag.json.

    Output is a JSON array written incrementally: '[' on open, one indented
    object per unique topic name (comma-separated), ']' on close.
    """

    def open_spider(self, spider):
        # BUGFIX: the original wrote to 'dataset/segmentfault.tag.json',
        # clobbering the segmentfault dataset; this pipeline owns infoq data.
        # Explicit utf-8 because items are dumped with ensure_ascii=False.
        self.file = open('dataset/infoq.tag.json', 'w', encoding='utf-8')
        self.file.write('[\n')
        self.count = 0   # items written so far (controls the leading comma)
        self.tags = {}   # topic names already written, for de-duplication

    def close_spider(self, spider):
        self.file.write('\n]')
        self.file.close()

    def process_item(self, item, spider):
        # Scrapy pipelines must return the item so later pipelines receive it.
        if self.tags.get(item['name']) is not None:
            return item
        self.tags[item['name']] = True
        words = []
        if self.count > 0:
            words.append(',\n')
        words.append('    ')
        words.append(json.dumps(item, ensure_ascii=False).strip())
        self.file.write(''.join(words))
        self.count += 1
        return item
def fetch():
    """Run the infoq topic spider in a blocking CrawlerProcess.

    Starts the Twisted reactor and returns only when the crawl finishes.
    """
    # NOTE(review): the original built an unused Settings() here; removed.
    process = CrawlerProcess()
    process.crawl(InfoQSpider)
    process.start()
\ No newline at end of file
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录