茵可露露 / Python 爬虫120例 (forked from and in sync with 梦想橡皮擦 / Python 爬虫120例)
Commit ecc073a0
Authored Oct 14, 2021 by 梦想橡皮擦

PNG IMG 采集 (PNG image collection)

Parent: 4a7834c2
Showing 3 changed files with 140 additions and 1 deletion (+140 / -1):
NO36/imgs/Thread E_vaccine_PNG70.png    +0    -0
NO36/免抠图片下载.py                      +139  -0
README.md                               +1    -1
NO36/imgs/Thread E_vaccine_PNG70.png (new file, mode 0 → 100644, binary image, 16.0 KB)

NO36/免抠图片下载.py (new file, mode 0 → 100644)
```python
import random
import logging
import threading
from typing import Optional, Text

import requests
from bs4 import BeautifulSoup
import lxml  # parser backend used by BeautifulSoup('…', 'lxml') below

logging.basicConfig(level=logging.WARNING)

thread_lock = threading.Lock()


class PngImg(threading.Thread):
    # Constructor
    def __init__(self, thread_name, headers_func, requests_func) -> None:
        threading.Thread.__init__(self)
        self._headers = headers_func()
        self._timeout = 5
        self.requests_func = requests_func
        self._thread_name = thread_name

    def run(self) -> None:
        bast_host = "http://pngimg.com"
        while True:
            # Take one list-page URL from the shared list under the global lock
            thread_lock.acquire()
            global all_links
            if not all_links:
                thread_lock.release()
                break
            list_url = bast_host + all_links.pop().get('href')
            thread_lock.release()
            print(self._thread_name + " 正在运行,采集的地址是 " + list_url)
            list_html_str = self.requests_func(url=list_url, headers=self._headers, timeout=self._timeout)
            ret_imgs = self._get_imgs(list_html_str)
            self._save(ret_imgs)

    def _get_imgs(self, html) -> list:
        """Collect all image addresses on a list page.

        :return: list of image src values
        """
        soup = BeautifulSoup(html, 'lxml')
        # The div tags that wrap the images
        div_imgs = soup.find_all(attrs={'class': 'png_imgs'})
        # Empty list used to collect the image src attributes
        imgs_src = []
        for div_img in div_imgs:
            # Walk each div and read the img tag nested inside its a tag
            imgs_src.append(div_img.a.img.get("src"))
        return imgs_src

    def _save(self, imgs):
        """Download and save the images."""
        for img in imgs:
            img = img.replace('small/', '')  # drop the small/ segment to get the full-size image
            img_url = "https://pngimg.com{}".format(img)  # build the full image URL
            name = img[img.rfind('/') + 1:]
            # print(img_url)
            # print(name)
            res = None
            try:
                res = requests.get(url=img_url, headers=self._headers, timeout=self._timeout)
            except Exception as e:
                logging.error(e)
            if res is not None:
                name = name.replace("/", "_")
                with open(f'./imgs/{self._thread_name}_{name}', "wb+") as f:
                    f.write(res.content)


def get_headers():
    uas = [
        "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
        "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
        "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
        "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
        "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
        "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
        "Sosospider+(+http://help.soso.com/webspider.htm)",
        "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)"
    ]
    ua = random.choice(uas)
    headers = {"user-agent": ua}
    return headers


# Generic requests GET helper
def get_html(url: Text, headers: dict, timeout: int) -> Optional[Text]:
    res = None
    try:
        res = requests.get(url=url, headers=headers, timeout=timeout)
    except Exception as e:
        logging.error(e)
    if res is not None:
        return res.text
    else:
        return None


if __name__ == '__main__':
    url = "http://pngimg.com/"
    headers = get_headers()
    # Fetch the HTML of the home page
    html_str = get_html(url, headers, 5)
    # Parse the home-page HTML and collect all list-page links
    soup = BeautifulSoup(html_str, 'lxml')
    div_parents = soup.find_all(attrs={'class': 'sub_category'})
    # All list-page anchors found on the home page
    all_links = []
    for div in div_parents:
        all_links.extend(div.find_all('a'))
    print("累计获取到", len(all_links), "个列表页数据")

    # Test with the first address only
    # first_url = all_links[0]
    #
    # list_url = first_url.get('href')
    # bast_host = "http://pngimg.com"
    # real_url = bast_host + list_url

    threads = ["Thread A", "Thread B", "Thread C", "Thread D", "Thread E"]
    for t in threads:
        my_thread = PngImg(t, get_headers, get_html)
        my_thread.start()
```
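The workers above coordinate through a module-level `all_links` list guarded by an explicit `threading.Lock`. For comparison, here is a minimal sketch of the same hand-off built on `queue.Queue`, which handles the locking internally; this is an illustration only, not part of the commit, and the names `link_queue` and `worker` are made up for the example.

```python
# Sketch only: the list-page hand-off from the script above, rewritten around
# queue.Queue instead of a shared list plus an explicit Lock. Names are illustrative.
import queue
import threading

link_queue = queue.Queue()


def worker(thread_name):
    while True:
        try:
            # Non-blocking get: an empty queue means there is no work left.
            list_url = link_queue.get_nowait()
        except queue.Empty:
            break
        print(thread_name, "processing", list_url)  # fetch / parse / save would go here
        link_queue.task_done()


# Fill the queue with the collected list-page URLs, then start one thread per
# name, mirroring the "Thread A".."Thread E" loop in the script above.
for href in ["http://pngimg.com/images/example"]:  # placeholder URL
    link_queue.put(href)

threads = [threading.Thread(target=worker, args=(name,)) for name in ("Thread A", "Thread B")]
for t in threads:
    t.start()
for t in threads:
    t.join()
```

Whichever variant is used, the relative `./imgs/` directory has to exist before the workers start writing files; the commit itself ships `NO36/imgs/` with a sample image already in it.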
README.md (+1 / -1)

```diff
@@ -73,7 +73,7 @@
 34. [在120篇系列专栏中,才能学会 python beautifulsoup4 模块,7000字博客+爬第九工场网](https://dream.blog.csdn.net/article/details/120384794)
 35. [都说python是万能的,这次用python看溧阳摄影圈,真不错](https://dream.blog.csdn.net/article/details/120407050)
-36. pngimg.com 透明 PNG 图片站采集
+36. [全程干货,用 python 下载某站全部【免抠图片】,图片背景透明,格式PNG](https://dream.blog.csdn.net/article/details/120414397)

 ### 📙 协程学习
```