Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
张益达_py
爬虫100例(复盘中)
提交
546ca312
爬
爬虫100例(复盘中)
项目概览
张益达_py
/
爬虫100例(复盘中)
与 Fork 源项目一致
Fork自
梦想橡皮擦 / 爬虫100例(复盘中)
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
爬
爬虫100例(复盘中)
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
546ca312
编写于
7月 22, 2021
作者:
梦想橡皮擦
💬
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
案例五复盘
上级
3d15bb1e
变更
3
隐藏空白更改
内联
并排
Showing
3 changed files
with
277 additions
and
0 deletions
+277
-0
案例5/demo.py
案例5/demo.py
+210
-0
案例5/downs/图片下载目录.txt
案例5/downs/图片下载目录.txt
+0
-0
案例5/http_help.py
案例5/http_help.py
+67
-0
未找到文件。
案例5/demo.py
0 → 100644
浏览文件 @
546ca312
import
http_help
as
hh
import
re
import
threading
import
time
import
os
import
requests
urls_lock
=
threading
.
Lock
()
# url操作锁
imgs_lock
=
threading
.
Lock
()
# 图片操作锁
imgs_start_urls
=
[]
class
Consumer
(
threading
.
Thread
):
def
__init__
(
self
):
threading
.
Thread
.
__init__
(
self
)
self
.
__headers
=
{
"Referer"
:
"http://www.jj20.com/bz/ktmh"
,
"Host"
:
"www.jj20.com"
}
self
.
__res
=
hh
.
R
(
headers
=
self
.
__headers
)
def
download_img
(
self
,
filder
,
img_down_url
,
filename
):
file_path
=
"./downs/{}"
.
format
(
filder
)
if
not
os
.
path
.
exists
(
file_path
):
os
.
mkdir
(
file_path
)
# 创建目录
if
os
.
path
.
exists
(
"./downs/{}/{}"
.
format
(
filder
,
filename
)):
return
else
:
try
:
# 由于图片比较大,请求时间调整到10秒
img
=
requests
.
get
(
img_down_url
,
headers
=
{
"Host"
:
"img.jj20.com"
},
timeout
=
10
)
except
Exception
as
e
:
print
(
e
)
print
(
"{}写入图片"
.
format
(
img_down_url
))
try
:
with
open
(
"./downs/{}/{}"
.
format
(
filder
,
filename
),
"wb+"
)
as
f
:
f
.
write
(
img
.
content
)
except
Exception
as
e
:
print
(
e
)
return
def
run
(
self
):
while
True
:
global
imgs_start_urls
,
imgs_lock
if
len
(
imgs_start_urls
)
>
0
:
if
imgs_lock
.
acquire
():
# 锁定
img_url
=
imgs_start_urls
[
0
]
# 获取到链接之后
del
imgs_start_urls
[
0
]
# 删掉第0项
imgs_lock
.
release
()
# 解锁
else
:
continue
# print("图片开始下载")
img_url
=
"http://www.jj20.com"
+
img_url
[
0
]
title
=
img_url
[
1
]
start_index
=
1
base_url
=
img_url
[
0
:
img_url
.
rindex
(
"."
)]
while
True
:
if
start_index
>
1
:
img_url
=
"{}_{}.html"
.
format
(
base_url
,
start_index
)
content
=
self
.
__res
.
get_content
(
img_url
,
charset
=
"gb2312"
)
if
content
is
not
None
:
pattern
=
re
.
compile
(
"<script>var id='(.*?)';</script>"
)
img_down_url
=
pattern
.
search
(
content
)
# 获取到了图片地址
if
img_down_url
is
not
None
:
filder
=
title
img_down_url
=
"http://www.jj20.com"
+
\
img_down_url
.
group
(
1
)
filename
=
img_down_url
[
img_down_url
.
rindex
(
"/"
)
+
1
:]
self
.
download_img
(
filder
,
img_down_url
,
filename
)
# 下载图片
else
:
print
(
"-"
*
100
)
print
(
content
)
break
# 终止循环体
else
:
print
(
"{}链接加载失败"
.
format
(
img_url
))
if
imgs_lock
.
acquire
():
# 锁定
imgs_start_urls
.
append
(
img_url
)
imgs_lock
.
release
()
# 解锁
start_index
+=
1
# time.sleep(3)
class
Product
(
threading
.
Thread
):
def
__init__
(
self
,
urls
):
threading
.
Thread
.
__init__
(
self
)
self
.
__urls
=
urls
self
.
__headers
=
{
"Referer"
:
"http://www.jj20.com/bz/ktmh"
,
"Host"
:
"www.jj20.com"
}
self
.
__res
=
hh
.
R
(
headers
=
self
.
__headers
)
def
add_fail_url
(
self
,
url
):
print
(
"{}该URL抓取失败"
.
format
(
url
))
global
urls_lock
if
urls_lock
.
acquire
():
self
.
__urls
.
insert
(
0
,
url
)
urls_lock
.
release
()
# 解锁
def
run
(
self
):
print
(
"*"
*
100
)
while
True
:
global
urls_lock
,
imgs_start_urls
if
len
(
self
.
__urls
)
>
0
:
if
urls_lock
.
acquire
():
# 锁定
last_url
=
self
.
__urls
.
pop
()
urls_lock
.
release
()
# 解锁
print
(
"正在操作{}"
.
format
(
last_url
))
content
=
self
.
__res
.
get_content
(
last_url
,
"gb2312"
)
if
content
is
not
None
:
html
=
self
.
get_page_list
(
content
)
if
len
(
html
)
==
0
:
self
.
add_fail_url
(
last_url
)
else
:
if
imgs_lock
.
acquire
():
imgs_start_urls
.
extend
(
html
)
imgs_lock
.
release
()
time
.
sleep
(
5
)
else
:
self
.
add_fail_url
(
last_url
)
else
:
print
(
"所有链接已经运行完毕"
)
break
def
get_page_list
(
self
,
content
):
pattern
=
re
.
compile
(
'<a href="(.*?)" target="_blank"><img src=".*?" width="270" height="151" alt="(.*?)"></a>'
)
list_page
=
re
.
findall
(
pattern
,
content
)
return
list_page
class
ImageList
():
def
__init__
(
self
):
self
.
__start
=
"http://www.jj20.com/bz/ktmh/list_16_{}.html"
# URL模板
self
.
__headers
=
{
"Referer"
:
"http://www.jj20.com/bz/ktmh"
,
"Host"
:
"www.jj20.com"
}
self
.
__res
=
hh
.
R
(
headers
=
self
.
__headers
)
# 初始化访问请求
def
run
(
self
):
page_count
=
43
# int(self.get_page_count())
if
page_count
==
0
:
return
urls
=
[
self
.
__start
.
format
(
i
)
for
i
in
range
(
1
,
page_count
)]
print
(
urls
)
return
urls
# 废弃掉该方法,直接人眼识别总页数
def
get_page_count
(
self
):
content
=
self
.
__res
.
get_content
(
self
.
__start
.
format
(
"1"
),
"gb2312"
)
pattern
=
re
.
compile
(
"<li><a href='list_11_(\d+?).html' target='_self'>末页</a></li>"
)
search_text
=
pattern
.
search
(
content
)
if
search_text
is
not
None
:
count
=
search_text
.
group
(
1
)
return
count
else
:
return
0
if
__name__
==
'__main__'
:
img
=
ImageList
()
urls
=
img
.
run
()
for
i
in
range
(
1
,
2
):
p
=
Product
(
urls
)
p
.
start
()
for
i
in
range
(
1
,
2
):
c
=
Consumer
()
c
.
start
()
案例5/downs/图片下载目录.txt
0 → 100644
浏览文件 @
546ca312
案例5/http_help.py
0 → 100644
浏览文件 @
546ca312
import
requests
from
retrying
import
retry
import
random
import
datetime
class
R
:
# 类的初始化方法
def
__init__
(
self
,
method
=
"get"
,
params
=
None
,
headers
=
None
,
cookies
=
None
):
self
.
__method
=
method
myheaders
=
self
.
get_headers
()
if
headers
is
not
None
:
myheaders
.
update
(
headers
)
self
.
__headers
=
myheaders
self
.
__cookies
=
cookies
self
.
__params
=
params
def
get_headers
(
self
):
user_agent_list
=
[
\
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"
\
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11"
,
\
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6"
,
\
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6"
,
\
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1"
,
\
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5"
,
\
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5"
,
\
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3"
,
\
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3"
,
\
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3"
,
\
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3"
,
\
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3"
,
\
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3"
,
\
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3"
,
\
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3"
,
\
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3"
,
\
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
,
\
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
UserAgent
=
random
.
choice
(
user_agent_list
)
headers
=
{
'User-Agent'
:
UserAgent
}
return
headers
@
retry
(
stop_max_attempt_number
=
3
)
def
__retrying_requests
(
self
,
url
):
if
self
.
__method
==
"get"
:
response
=
requests
.
get
(
url
,
headers
=
self
.
__headers
,
cookies
=
self
.
__cookies
,
timeout
=
3
)
else
:
response
=
requests
.
post
(
url
,
params
=
self
.
__params
,
headers
=
self
.
__headers
,
cookies
=
self
.
__cookies
,
timeout
=
3
)
return
response
.
content
# get请求
def
get_content
(
self
,
url
,
charset
=
"utf-8"
):
try
:
html_str
=
self
.
__retrying_requests
(
url
).
decode
(
charset
)
except
:
html_str
=
None
return
html_str
def
get_file
(
self
,
file_url
):
try
:
file
=
self
.
__retrying_requests
(
file_url
)
except
:
file
=
None
return
file
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录