Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
茵可露露
Python 爬虫120例
提交
22fe92d3
Python 爬虫120例
项目概览
茵可露露
/
Python 爬虫120例
与 Fork 源项目一致
Fork自
梦想橡皮擦 / Python 爬虫120例
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Python 爬虫120例
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
22fe92d3
编写于
8月 31, 2021
作者:
梦想橡皮擦
💬
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
花容网,一派网,热门话题案例上传
上级
a1dbe14f
变更
4
展开全部
隐藏空白更改
内联
并排
Showing
4 changed files
with
41232 additions
and
0 deletions
+41232
-0
NO26/医美.py
NO26/医美.py
+94
-0
NO27/data.csv
NO27/data.csv
+40985
-0
NO27/一派数据采集.py
NO27/一派数据采集.py
+52
-0
NO27/话题广场.py
NO27/话题广场.py
+101
-0
未找到文件。
NO26/医美.py
0 → 100644
浏览文件 @
22fe92d3
import
requests
import
threading
from
queue
import
Queue
from
lxml
import
etree
import
time
import
random
# Unbounded FIFO queue (maxsize=0) of listing-page URLs still to be fetched.
q = Queue(maxsize=0)

# Seed the queue with listing pages 1, 2 and 3.
for page in range(1, 4):
    page_url = 'https://www.huaroo.net/d/pg_{}/'.format(page)
    q.put(page_url)
# Build the request headers
def get_headers():
    """Return HTTP request headers with a randomly picked User-Agent.

    Every call draws one entry from a fixed list of User-Agent strings
    (mostly search-engine crawlers) and pairs it with a constant Baidu
    referer.

    Returns:
        dict with the keys ``"user-agent"`` and ``"referer"``.
    """
    agents = (
        "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
        "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
        "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
        "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
        "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
        "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
        "Sosospider+(+http://help.soso.com/webspider.htm)",
        "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)",
    )
    return {
        "user-agent": random.choice(agents),
        "referer": "https://www.baidu.com",
    }
# Parse one listing page and hand its rows to save()
def format(text):
    """Extract the article cards from a listing page and save them as CSV lines.

    NOTE: this name shadows the built-in ``format``; it is kept because
    ``run`` calls the function by this name.

    Every ``div`` whose class contains ``article_list`` yields one line of
    the form ``title,hospital,duties,practice,project``. A field whose node
    is absent is written as an empty string instead of raising
    ``IndexError``, so a single incomplete card no longer aborts the page
    and discards the rows already collected. All lines of the page are
    passed to ``save`` in one call.

    NOTE(review): field values are joined with plain commas and are not
    CSV-escaped; a value containing a comma will shift columns.

    Args:
        text: HTML source of one listing page.
    """
    def first_text(node, path):
        # xpath() returns a list, which is empty when nothing matched.
        matches = node.xpath(path)
        return matches[0].strip() if matches else ""

    # Column order of the output line: title, hospital, duties, practice, project.
    field_paths = (
        "./a/div/div[@class='article_title']/text()",
        "./a/div/div[@class='hospital_list_content mt10 oh']/div[1]/text()",
        "./a/div/div[@class='hospital_list_content mt10 oh']/div[2]/text()",
        "./a/div/div[@class='hospital_list_content mt10 oh']/div[3]/text()",
        "./a/div/div[@class='hospital_list_content mt10 oh']/div[4]/text()",
    )

    element = etree.HTML(text)
    article_list = element.xpath('//div[contains(@class,"article_list")]')

    lines = []
    for article in article_list:
        values = [first_text(article, path) for path in field_paths]
        lines.append(",".join(values) + "\n")

    save("".join(lines))
# Persist the collected rows
def save(wait_save_str):
    """Append ``wait_save_str`` to ``./医美2.csv`` and echo it to stdout.

    The file is opened in ``a+`` mode with UTF-8 encoding, so repeated calls
    accumulate in the same file. The text is written verbatim; no separator
    or newline is added.

    Args:
        wait_save_str: Text to append.
    """
    with open('./医美2.csv', mode='a+', encoding='utf-8') as csv_file:
        csv_file.write(wait_save_str)
    print(wait_save_str, "---保存成功")
# Crawler worker: request and parse the queued pages
def run():
    """Drain the shared URL queue, downloading and parsing each page.

    Used as the target of the worker threads. Two defects of the earlier
    loop are fixed:

    * ``q.qsize() > 0`` followed by a blocking ``q.get()`` is a
      check-then-act race: another thread can take the last URL between the
      two calls, leaving this thread blocked forever. ``get_nowait()``
      combined with ``queue.Empty`` makes "take one or stop" atomic.
    * ``task_done()`` was called before the page was processed, so
      ``q.join()`` could return while work was still running. It is now
      called once processing has finished, even if processing raised.

    Exceptions from the request or from ``format`` still propagate and end
    the calling thread.
    """
    # Local import: only ``Queue`` is imported at file level.
    from queue import Empty

    while True:
        try:
            url = q.get_nowait()
        except Empty:
            # Nothing left to fetch; let the thread finish.
            break
        try:
            res = requests.get(url=url, headers=get_headers(), timeout=10)
            format(res.text)
        finally:
            q.task_done()
# Launch two worker threads that share the URL queue.
workers = [threading.Thread(target=run) for _ in range(2)]
for worker in workers:
    worker.start()

# Block until both workers have returned.
for worker in workers:
    worker.join()
print("多线程执行完毕")

# Block until every queued item has been marked done.
q.join()
print("所有线程运行完毕")
NO27/data.csv
0 → 100644
浏览文件 @
22fe92d3
此差异已折叠。
点击以展开。
NO27/一派数据采集.py
0 → 100644
浏览文件 @
22fe92d3
import
requests
import
threading
from
queue
import
LifoQueue
import
time
import
random
# Unbounded LIFO queue (maxsize=0): the URL added last is fetched first.
q = LifoQueue(maxsize=0)

# Seed the queue with six API pages of ten entries each (offsets 0, 10, ..., 50).
# Example URL: https://sspai.com/api/v1/bullet/search/page/get?type=0&limit=10&offset=0&created_at=0
for offset in range(0, 60, 10):
    q.put(
        'https://sspai.com/api/v1/bullet/search/page/get?type=0&limit=10&offset={}&created_at=0'.format(offset)
    )
def get_headers():
    """Return HTTP request headers with a randomly picked User-Agent.

    Every call draws one entry from a fixed list of User-Agent strings
    (mostly search-engine crawlers).

    Returns:
        dict with the single key ``"user-agent"``.
    """
    agents = (
        "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
        "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
        "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
        "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
        "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
        "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
        "Sosospider+(+http://help.soso.com/webspider.htm)",
        "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)",
    )
    return {"user-agent": random.choice(agents)}
# Persist one response body
def save(text):
    """Append ``text`` to a ``.json`` file named after the current timestamp.

    The file name is ``time.time()`` formatted as a float plus the ``.json``
    suffix, relative to the working directory. The text is written verbatim
    and then echoed to stdout.

    Args:
        text: Response body to store.
    """
    file_name = f'{time.time()}.json'
    with open(file_name, mode='a+', encoding='utf-8') as out_file:
        out_file.write(text)
    print(text, "--- 保存成功")
if __name__ == "__main__":
    # Fetch the queued API pages one at a time in the main thread and store
    # each response body with save().
    while not q.empty():
        url = q.get()
        # The item is marked done right after retrieval, before the download.
        q.task_done()
        response = requests.get(url=url, headers=get_headers(), timeout=10)
        save(response.text)

    # Every item has been marked done by now, so this returns immediately.
    q.join()
    print("所有任务都已完成")
NO27/话题广场.py
0 → 100644
浏览文件 @
22fe92d3
from
queue
import
Queue
import
time
import
threading
import
requests
from
lxml
import
etree
import
random
def get_headers():
    """Return HTTP request headers with a randomly picked User-Agent.

    Every call draws one entry from a fixed list of User-Agent strings
    (mostly search-engine crawlers).

    Returns:
        dict with the single key ``"user-agent"``.
    """
    agents = (
        "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
        "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
        "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
        "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
        "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
        "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
        "Sosospider+(+http://help.soso.com/webspider.htm)",
        "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)",
    )
    return {"user-agent": random.choice(agents)}
# Queue of topic ids; filled by producer(), drained by consumer().
q_data_ids = Queue(maxsize=0)

# Queue of hot-topic listing pages still to be crawled (pages 1 through 10).
hot_subjects = Queue(maxsize=0)
for page_no in range(1, 11):
    hot_subjects.put(
        f'https://www.jisilu.cn/topic/square/id-hot__feature_id-__page-{page_no}'
    )
# Producer
def producer():
    """Crawl the hot-topic listing pages and enqueue every topic id found.

    Used as the target of the producer threads. For each URL taken from
    ``hot_subjects`` the page is downloaded and the ``data-id`` attribute of
    every ``a.aw-topic-name`` link is put on ``q_data_ids``.

    Two defects of the earlier loop are fixed:

    * ``qsize() > 0`` followed by a blocking ``get()`` is a check-then-act
      race: another producer can take the last URL in between, leaving this
      thread blocked forever. ``get_nowait()`` combined with ``queue.Empty``
      avoids that.
    * ``task_done()`` is now called after the page has been handled (even if
      handling raised) rather than before the work started.

    Exceptions from the request or the parse still propagate and end the
    calling thread.
    """
    # Local import: only ``Queue`` is imported at file level.
    from queue import Empty

    while True:
        try:
            list_url = hot_subjects.get_nowait()
        except Empty:
            # No listing pages left.
            break
        try:
            print("正在解析:", list_url)
            # Download the listing page and collect its topic ids.
            res = requests.get(list_url, headers=get_headers(), timeout=3)
            element = etree.HTML(res.text)
            data_ids = element.xpath('//a[@class="aw-topic-name"]/@data-id')
            for data_id in data_ids:
                q_data_ids.put(data_id)
        finally:
            hot_subjects.task_done()
# Consumer
def consumer():
    """Download every discussion page of each queued topic into ``./data.csv``.

    Topic ids are taken from ``q_data_ids`` until a ``None`` sentinel
    arrives. For each topic, pages are requested starting at page 1 until a
    response with an empty body is received. Every non-empty page
    contributes one ``title,user name,url`` line per entry, appended to
    ``./data.csv``.

    Changes from the earlier version:

    * Page 1 was requested twice per topic (a probe request whose body was
      never parsed, then the same URL again inside the loop). Each page is
      now requested exactly once; the rows written are unchanged.
    * ``task_done()`` is called after the topic has been processed (even if
      processing raised) rather than immediately after ``get()``.

    NOTE(review): titles, names and urls are paired with ``zip``, which
    assumes the three XPath results line up one-to-one; values are not
    CSV-escaped.
    """
    while True:
        # Blocks until a topic id (or the None sentinel) is available.
        data_id = q_data_ids.get()
        try:
            if data_id is None:
                break

            start_page = 1
            while True:
                url = f'https://www.jisilu.cn/question/ajax/discuss/sort_type-new__topic_id-{data_id}__page-{start_page}'
                res = requests.get(url=url, headers=get_headers(), timeout=5)
                print(res.url)
                text = res.text
                if len(text) == 0:
                    # An empty body marks the end of this topic's pages.
                    break

                element = etree.HTML(text)
                titles = element.xpath('//h4/a/text()')
                urls = element.xpath('//h4/a/@href')
                names = element.xpath('//a[@class="aw-user-name"]/text()')
                long_str = "".join(
                    f"{title},{name},{link}\n"
                    for title, name, link in zip(titles, names, urls)
                )
                with open("./data.csv", "a+", encoding="utf-8") as f:
                    f.write(long_str)

                start_page += 1
        finally:
            q_data_ids.task_done()
# Start 2 producer threads.
producers = [threading.Thread(target=producer) for _ in range(2)]
for thread in producers:
    thread.start()

# Start 1 consumer thread. The earlier comment announced two, but
# range(1, 2) only ever started one; a single writer is kept so that
# appends to ./data.csv are never concurrent.
consumers = [threading.Thread(target=consumer) for _ in range(1)]
for thread in consumers:
    thread.start()

# Once every producer has returned, no further topic ids will be queued.
for thread in producers:
    thread.join()

# consumer() stops only when it reads None. Nothing ever enqueued that
# sentinel, so the consumer blocked on get() forever and the process never
# exited. Send one sentinel per consumer thread.
for _ in consumers:
    q_data_ids.put(None)

for thread in consumers:
    thread.join()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录