梦想橡皮擦 / Python 爬虫120例
Commit bbb6e2e1
Authored December 20, 2022 by 梦想橡皮擦
Commit message: 复盘案例 (case reviews)
Parent: 705acb4c
Showing 5 changed files with 335 additions and 0 deletions (+335, −0)
复盘案例/句子网.py  +83  −0
复盘案例/可爱女人.py  +100  −0
复盘案例/站酷.py  +11  −0
复盘案例/黄鹤楼.py  +58  −0
案例17更新,群走网/句子网.py  +83  −0

复盘案例/句子网.py  (new file, 0 → 100644)
import requests
from lxml import etree
import random


class Spider16:
    def __init__(self):
        self.wait_urls = ["https://www.qunzou.com/xuexi/list_1_1.html"]
        self.url_template = "https://www.qunzou.com/xuexi/list_1_{num}.html"
        self.details = []

    def get_headers(self):
        uas = [
            "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
            "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
            "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
            "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
            "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
            "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
            "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
            "Sosospider+(+http://help.soso.com/webspider.htm)",
            "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)"
        ]
        ua = random.choice(uas)
        headers = {"user-agent": ua, "referer": "https://www.baidu.com"}
        return headers

    # Build the list of pages waiting to be crawled
    def create_urls(self):
        headers = self.get_headers()
        page_url = self.wait_urls[0]
        res = requests.get(url=page_url, headers=headers, timeout=5)
        html = etree.HTML(res.text)
        # Extract the total number of pages
        last_page = html.xpath("//span[@class='pageinfo']/strong[1]/text()")[0]
        # Generate the pages waiting to be crawled
        for i in range(1, int(last_page) + 1):
            self.wait_urls.append(self.url_template.format(num=i))

    def get_html(self):
        for url in self.wait_urls:
            headers = self.get_headers()
            res = requests.get(url, headers=headers, timeout=5)
            if res:
                html = etree.HTML(res.text)
                detail_link_list = html.xpath("//div[@class='list']//h6/a/@href")
                for d in detail_link_list:
                    self.details.append(f"https://www.qunzou.com{d}")
            # For testing only: stop after the first page
            return

    def get_detail(self):
        for url in self.details:
            headers = self.get_headers()
            res = requests.get(url, headers=headers, timeout=5)
            res.encoding = "gb2312"
            if res:
                html = etree.HTML(res.text)
                sentences = html.xpath("//div[@id='content']//p/text()")
                # Print the sentences
                long_str = "\n".join(sentences)
                print(long_str)
                # with open("sentences.txt", "a+", encoding="utf-8") as f:
                #     f.write(long_str)

    def run(self):
        self.create_urls()
        self.get_html()
        self.get_detail()


if __name__ == '__main__':
    s = Spider16()
    s.run()
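
A side note on the "if res:" guards in get_html and get_detail: a requests.Response is truthy only when its status code is below 400 (Response.__bool__ delegates to Response.ok), so these branches silently skip error pages. A tiny standalone check, using the same list page the spider starts from, makes the equivalent explicit form visible:

import requests

# `if res:` on a Response object is shorthand for `if res.ok:`, i.e. status_code < 400.
res = requests.get("https://www.qunzou.com/xuexi/list_1_1.html", timeout=5)
if res.status_code < 400:
    print("page fetched:", res.status_code)
else:
    print("error page skipped:", res.status_code)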

复盘案例/可爱女人.py  (new file, 0 → 100644)
import requests
import re
import threading
import time

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
}
# Detail-page URLs (the pages that contain the images)
detail_urls = []
mutex = threading.Lock()


# Collect detail-page URLs from one list page
def get_detail_urls(url):
    res = requests.get(url=url, headers=headers)
    res.encoding = 'gb2312'
    if res is not None:
        # Read the page source
        html = res.text
        # Crop the target source: keep the block of ul class="g-gxlist-imgbox",
        # which sits between <ul class="g-gxlist-imgbox"> and <div class="pagelist">
        html = html[html.find('<ul class="g-gxlist-imgbox">'):html.find('<div class="pagelist">')]
        # After cropping, the data can be extracted with a regular expression
        pattern = re.compile('<a href="(.*?)" target="_blank" title=".*?">')
        # Extract the detail-page addresses
        find_urls = pattern.findall(html)
        if find_urls:
            # Acquire the lock
            mutex.acquire()
            # Append to the global list
            detail_urls.extend(find_urls)
            # Release the lock
            mutex.release()


# Image-saving worker (run in a thread)
def save_image():
    global detail_urls
    while True:
        # Acquire the lock
        mutex.acquire()
        if len(detail_urls) > 0:
            # Take the first item of the list
            img_url = detail_urls[0]
            # Delete the first item of the list
            del detail_urls[0]
            # Release the lock
            mutex.release()
            res = requests.get(url=img_url, headers=headers)
            if res is not None:
                html = res.text
                # Crop the target source so the image block can be extracted as a whole
                html = html[html.find('<div class="img-list3">'):html.find('<div class="m_ssxx">')]
                pattern = re.compile('<img alt=".*?" src="(.*?)" />')
                img_list = pattern.findall(html)
                if img_list:
                    for img in img_list:
                        print(f"Thread {threading.currentThread().name}", "fetching image:", img)
                        try:
                            res = requests.get(img)
                            with open(f"images/{threading.currentThread().name + str(time.time())}.png", "wb+") as f:
                                f.write(res.content)
                        except Exception as e:
                            print(e)
        else:
            # Release the lock before idling, otherwise neither worker can ever acquire it again
            mutex.release()
            print("Waiting... if this drags on for long, you can simply close the program")
            time.sleep(1)


if __name__ == '__main__':
    # Generate the paginated list addresses
    origin_url = ['http://www.imeitou.com/nvsheng/']
    for i in range(2, 11):
        origin_url.append(f'http://www.imeitou.com/nvsheng/index_{i}.html')

    # Collect the image detail-page addresses
    for d_url in origin_url:
        get_detail_urls(d_url)

    # Check the collected detail-page addresses
    # (a test run returned 160 addresses, which is the expected amount)
    print(len(detail_urls))

    # Configure and start the image-saving threads; two threads are used here
    save1 = threading.Thread(target=save_image)
    save1.start()

    save2 = threading.Thread(target=save_image)
    save2.start()
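
The two save_image threads above never exit on their own: once detail_urls is drained they just keep printing the waiting message and the script has to be closed by hand. A minimal alternative sketch, not the author's code, that swaps the manual lock for the standard-library queue.Queue plus a sentinel so each worker terminates cleanly; the URLs here are placeholders standing in for what get_detail_urls collects:

import queue
import threading

task_queue = queue.Queue()
STOP = object()  # sentinel telling a worker to exit

def worker():
    while True:
        item = task_queue.get()
        if item is STOP:
            break
        # A real worker would download the image here, as save_image does above.
        print(threading.current_thread().name, "would fetch", item)

# Placeholder detail-page URLs; in the original these come from get_detail_urls().
for url in ["http://www.imeitou.com/placeholder_1.html", "http://www.imeitou.com/placeholder_2.html"]:
    task_queue.put(url)

workers = [threading.Thread(target=worker) for _ in range(2)]
for w in workers:
    w.start()
for _ in workers:
    task_queue.put(STOP)  # one sentinel per worker
for w in workers:
    w.join()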

复盘案例/站酷.py  (new file, 0 → 100644)
# import requests
#
# response = requests.get("https://www.uisdc.com/archives")
# content = response.text
#
# with open("ca_demo.html", "w") as file:
#     file.write(content)

import urllib.parse

decoded = urllib.parse.unquote(
    "%3Ci+class%3D%22uname%22+title%3D%22%E4%BC%98%E7%A7%80%E7%BD%91%E9%A1%B5%E8%AE%BE%E8%AE%A1%22%3E%E4%BC%98%E7%A7%80%E7%BD%91%E9%A1%B5%E8%AE%BE%E8%AE%A1%3C%2Fi%3E"
)
print(decoded)
\ No newline at end of file
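
One detail about the decode above: urllib.parse.unquote only expands the %XX escapes and leaves the plus signs (which stand for spaces in form/query encoding) untouched, whereas unquote_plus converts them to spaces as well. A short comparison on a trimmed version of the same encoded string:

import urllib.parse

s = "%3Ci+class%3D%22uname%22%3E"
print(urllib.parse.unquote(s))       # <i+class="uname">   (plus sign kept)
print(urllib.parse.unquote_plus(s))  # <i class="uname">   (plus sign becomes a space)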

复盘案例/黄鹤楼.py  (new file, 0 → 100644)
import threading
import asyncio
import time
import requests
import lxml
from bs4 import BeautifulSoup


async def get(url):
    return requests.get(url)


async def get_html(url):
    print("Preparing to fetch:", url)
    res = await get(url)
    return res.text


async def save_img(img_url):
    print("Downloading image:", img_url)
    res = await get(img_url)
    if res is not None:
        with open(f'./imgs/{time.time()}.jpg', 'wb') as f:
            f.write(res.content)
        return img_url, "ok"


async def main(url_list):
    # Create 5 tasks, one per URL in this batch
    tasks = [asyncio.ensure_future(get_html(url_list[_])) for _ in range(len(url_list))]
    dones, pending = await asyncio.wait(tasks)
    for task in dones:
        html = task.result()
        soup = BeautifulSoup(html, 'lxml')
        div_tag = soup.find(attrs={'class': 'lbox'})
        imgs = div_tag.find_all('img')
        for img in imgs:
            ret = await save_img(img["data-original"])
            print(ret)


if __name__ == '__main__':
    # Switched to the huanghelou.cc site; for easy testing only pages 1-9 are used
    urls = [f"https://www.huanghelou.cc/category-44_{page}.html" for page in range(1, 10)]

    total_page = len(urls) // 5 if len(urls) % 5 == 0 else len(urls) // 5 + 1

    # Slice the urls list into batches of 5 to make collection easier
    for page in range(0, total_page):
        start_page = 0 if page == 0 else page * 5
        end_page = (page + 1) * 5
        # Event loop object
        loop = asyncio.get_event_loop()
        loop.run_until_complete(main(urls[start_page:end_page]))

案例17更新,群走网/句子网.py  (new file, 0 → 100644)
(Content identical to 复盘案例/句子网.py above: the same 83-line Spider16 crawler, duplicated into this second path.)