Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
wuyelin21
爬虫100例(复盘中)
提交
3d15bb1e
爬
爬虫100例(复盘中)
项目概览
wuyelin21
/
爬虫100例(复盘中)
与 Fork 源项目一致
Fork自
梦想橡皮擦 / 爬虫100例(复盘中)
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
爬
爬虫100例(复盘中)
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
3d15bb1e
编写于
7月 17, 2021
作者:
梦想橡皮擦
💬
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
站酷网用户爬虫
上级
5f0d5245
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
213 additions
and
0 deletions
+213
-0
案例3/站酷网用户爬虫.py
案例3/站酷网用户爬虫.py
+213
-0
未找到文件。
案例3/站酷网用户爬虫.py
0 → 100644
浏览文件 @
3d15bb1e
# -*- coding: UTF-8 -*-
import
requests
# 网络请求模块
import
random
# 随机模块
import
re
# 正则表达式模块
import
time
# 时间模块
import
threading
# 线程模块
import
pymongo
as
pm
# mongodb模块
class Config():
    """Supplies randomized HTTP request headers for the crawler threads."""

    def getHeaders(self):
        """Return a headers dict with a User-Agent chosen at random.

        FIX: the original list was missing the comma between the first two
        entries, so Python's implicit string-literal concatenation silently
        merged them into one malformed User-Agent value that was sent for
        roughly 1 in 17 requests.
        """
        user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        ]
        UserAgent = random.choice(user_agent_list)
        headers = {'User-Agent': UserAgent}
        return headers
# Seed URL for the crawl: the first follow-list page of one user.
urls = ["https://douge2013.zcool.com.cn/follow?condition=0&p=1"]
# Monotonic counter used as the document index for every queued URL.
index = 0
# Single global lock guarding both `urls` and the counters above.
g_lock = threading.Lock()
# MongoDB connection (port is an int, not a string).
client = pm.MongoClient('127.0.0.1', 27017)
# Target database.
db = client.zcool
# Database-level user authentication.
# NOTE(review): Database.authenticate was removed in pymongo 4.x — confirm
# the installed pymongo version supports this call.
db.authenticate("zcool", "zcool")
# Counter the Consumer threads advance to claim queued URLs by index.
get_index = 0
# Producer thread: pops a follow-list URL from the shared queue, expands its
# pagination, extracts the followed users' profile links, and feeds every new
# URL back into the queue and into the MongoDB `text` collection.
class Producer(threading.Thread):
    """Discovers new follow-list URLs and queues them for the consumers."""

    def run(self):
        print("线程启动...")
        headers = Config().getHeaders()
        print(headers)
        global urls
        global index
        while True:
            g_lock.acquire()
            if len(urls) == 0:
                g_lock.release()
                # FIX: the original `continue`d immediately, busy-spinning on
                # the lock while the queue was empty; back off briefly.
                time.sleep(0.5)
                continue
            page_url = urls.pop()
            g_lock.release()  # release promptly so other threads can proceed

            try:
                response = requests.get(page_url, headers=headers, timeout=5)
            except Exception as http:
                print("生产者异常")
                print(http)
                continue
            content = response.text

            # Only the first page (&p=1) carries the pagination footer, so
            # pagination is expanded exactly once per user.
            is_home = re.search(r'\&p\=(\d+?)', page_url).group(1)
            if is_home == str(1):
                # Matches the page number directly before the "下一页" (next
                # page) HTML comment; re.S lets the pattern span newlines.
                pages = re.findall(r'(\d+?)[.\s]*?<\/a>[.\s]*?<!\-\- 下一页 \-\->', content, re.S)
                page_size = 1
                if pages:
                    page_size = int(max(pages))  # largest page number found
                if page_size > 1:
                    url_arr = []
                    threading_links_1 = []
                    for page in range(2, page_size + 1):
                        # Rewrite the &p= query parameter for each extra page.
                        url = re.sub(r'\&p\=(\d+?)', "&p=" + str(page), page_url)
                        threading_links_1.append(url)
                        g_lock.acquire()
                        index += 1
                        # FIX: capture the counter value while still holding
                        # the lock — the original read `index` after release,
                        # so two threads could record the same index.
                        my_index = index
                        g_lock.release()
                        url_arr.append({"index": my_index, "link": url})
                    g_lock.acquire()
                    urls += threading_links_1  # enqueue the extra pages
                    g_lock.release()
                    try:
                        # ordered=False keeps inserting past duplicate-key errors
                        db.text.insert_many(url_arr, ordered=False)
                    except Exception as e:
                        print("数据库输入异常")
                        print(e)
                        continue

            # Extract the profile link of every followed user on this page.
            rc = re.compile(r'<a href="(.*?)" title=".*?" class="avatar" target="_blank" z-st="member_content_card_1_user_face">')
            follows = rc.findall(content)
            fo_url = []
            threading_links_2 = []
            for u in follows:
                # Build the follow-list URL for the discovered user.
                this_url = "%s/follow?condition=0&p=1" % u
                g_lock.acquire()
                index += 1
                my_index = index  # FIX: same capture-under-lock as above
                g_lock.release()
                fo_url.append({"index": my_index, "link": this_url})
                threading_links_2.append(this_url)
            g_lock.acquire()
            urls += threading_links_2
            g_lock.release()
            # FIX: guard against an empty batch — pymongo's insert_many
            # raises on an empty document list.
            if fo_url:
                try:
                    db.text.insert_many(fo_url, ordered=False)
                except Exception as e:  # FIX: was a silent bare `except:`
                    print(e)
                    continue
# Consumer thread: claims queued URLs from MongoDB by index, fetches each
# page, extracts the user id/name pairs, and stores them in `mkusers`.
class Consumer(threading.Thread):
    """Fetches queued follow-list pages and persists the users found."""

    def run(self):
        headers = Config().getHeaders()
        global get_index
        while True:
            g_lock.acquire()
            get_index += 1
            # FIX: read the counter while still holding the lock; the
            # original queried with the shared `get_index` after release,
            # so two consumers could claim the same index.
            my_index = get_index
            g_lock.release()
            # find_one_and_delete atomically claims one queued URL so no
            # two consumers process the same document.
            # NOTE(review): if consumers outrun the producers, the counter
            # can pass indices inserted later and those URLs are never
            # consumed — confirm this is acceptable for the crawl.
            link = db.text.find_one_and_delete({"index": my_index})
            if link:
                page_url = link["link"]
                print(page_url + ">>>网页分析中...")
            else:
                continue

            try:
                response = requests.get(page_url, headers=headers, timeout=5)
            except Exception as http:
                print("消费者有异常")
                print(http)
                continue
            content = response.text

            # Each user card carries its id and name as data attributes.
            rc = re.compile(r'<div class="author-info" data-id="(?P<ID>\d+?)" data-name="(?P<NAME>.*?)">')
            user_info = rc.findall(content)
            print(">>>>>>>>>>>>>>>>>>>>")
            users = [{"id": user[0], "name": user[1]} for user in user_info]
            print(users)

            # FIX: the original called insert_many even when `users` was
            # empty; pymongo raises on an empty document list, so every
            # user-less page printed a spurious database error and the
            # `continue` skipped the politeness sleep below.
            if users:
                try:
                    db.mkusers.insert_many(users, ordered=False)
                except Exception as e:
                    print("数据库输入异常")
                    print(e)
                    continue
            time.sleep(1)  # throttle request rate
            print("<<<<<<<<<<<<<<<<<<<<")
if __name__ == "__main__":
    # Launch the crawler: five producer threads discovering URLs and
    # seven consumer threads draining the queue.
    for _ in range(5):
        Producer().start()
    for _ in range(7):
        Consumer().start()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录