Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
qq_45598856
white-jotter
提交
a973ad14
W
white-jotter
项目概览
qq_45598856
/
white-jotter
与 Fork 源项目一致
从无法访问的项目Fork
通知
2
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
W
white-jotter
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
a973ad14
编写于
3月 21, 2019
作者:
E
Evan
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
update: add python spiders
上级
404f61ee
变更
2
隐藏空白更改
内联
并排
Showing
2 changed files
with
115 additions
and
0 deletions
+115
-0
wj/pic.py
wj/pic.py
+79
-0
wj/pic2.py
wj/pic2.py
+36
-0
未找到文件。
wj/pic.py
0 → 100644
浏览文件 @
a973ad14
import
requests
import
re
,
os
,
threading
class myThread(threading.Thread):
    """Worker thread that downloads one image to <dir>/<filename>.jpg.

    Thin wrapper around the module-level download_pic() helper so many
    images can be fetched concurrently.
    """

    def __init__(self, url, dir, filename):
        # Modern super() call instead of the explicit
        # threading.Thread.__init__(self) the original used.
        super().__init__()
        self.threadID = filename   # thread label; mirrors the target filename
        self.url = url             # full image URL to fetch
        self.dir = dir             # destination directory (NOTE: shadows builtin dir())
        self.filename = filename   # basename without the '.jpg' extension

    def run(self):
        # Delegate the actual HTTP fetch + file write to download_pic().
        download_pic(self.url, self.dir, self.filename)
def download_pic(url, dir, filename):
    """Fetch *url* and save the response body as <dir>/<filename>.jpg.

    Silently does nothing when the server answers with a non-200 status.
    Relies on the module-level ``headers`` dict for the request headers.
    """
    # Timeout added so a stalled connection cannot hang its worker thread
    # forever (the original request had no timeout at all).
    req = requests.get(url=url, headers=headers, timeout=30)
    if req.status_code == 200:
        # Binary write; the server is expected to return raw JPEG bytes.
        with open(str(dir) + '/' + str(filename) + '.jpg', 'wb') as f:
            f.write(req.content)
flag = 1

# Request headers used by download_pic(); the Referer presumably satisfies
# the image host's hotlink check — TODO confirm. The trailing 'Name' in the
# User-Agent is preserved from the original.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36Name',
    'Referer': 'http://www.mm131.com/'
}


def _crawl_list_page(list_url):
    """Download every gallery linked from one list page *list_url*.

    Returns False when the list page is unreachable (non-200 response),
    which the main loop below uses as its stop condition; True otherwise.

    The original script had this whole body duplicated verbatim in both
    branches of its flag check; it is factored out here once.
    """
    listing = requests.get(list_url)
    if listing.status_code != 200:
        return False
    # Numeric gallery ids scraped from the list page's thumbnail links.
    gallery_ids = re.findall(
        r'<dd><a target="_blank" href="http://www.mm131.com/xinggan/([0-9]*).html"><img src=',
        listing.text)
    for a in gallery_ids:
        getpage = requests.get('http://www.mm131.com/xinggan/' + str(a) + '.html')
        # Gallery pages are served as GB2312; decode leniently before scraping.
        html = str(getpage.content, 'gb2312', errors='ignore')
        tittle = re.findall(r'<h5>(.*)</h5>', html)
        pages = re.findall(r'<span class="page-ch">共(.*?)页</span>', html)
        if not pages:
            # The original indexed pages[0] unconditionally and would crash
            # with IndexError on an unexpected page; skip the gallery instead.
            continue
        page = pages[0]
        download_url = 'http://img1.mm131.me/pic/' + str(a) + '/'
        threads = []
        for t in tittle:
            if not os.path.exists(t):
                os.makedirs(t)
                print('开始下载:' + t)
                for page_img in range(int(page)):
                    download_img_url = download_url + str(page_img) + '.jpg'
                    # One worker thread per image of this gallery.
                    worker = myThread(download_img_url, t, page_img)
                    worker.start()
                    threads.append(worker)
                # Wait for all workers before reporting completion. (The
                # original reused the loop variable `t` here, clobbering
                # the gallery title; renamed to avoid the shadowing.)
                for worker in threads:
                    worker.join()
                print('下载完成')
            else:
                print('文件夹已存在,跳过')
    return True


# Walk the paginated gallery listing until a list page 404s.
while True:
    if flag == 1:
        # The first list page lives at the section root, not list_6_1.html.
        url = 'http://www.mm131.com/xinggan/'
    else:
        url = 'http://www.mm131.com/xinggan/list_6_' + str(flag) + '.html'
    if not _crawl_list_page(url):
        break
    flag = flag + 1
    print('这一页的任务已经完成了')
\ No newline at end of file
wj/pic2.py
0 → 100644
浏览文件 @
a973ad14
# -*- coding:UTF-8 -*-
from
bs4
import
BeautifulSoup
from
urllib.request
import
urlretrieve
import
requests
import
os
import
time
if __name__ == '__main__':
    list_url = []

    # Invariant request headers — hoisted out of the page loop, where the
    # original rebuilt this dict on every iteration.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }

    # Collect every image URL from the album's 19 listing pages.
    for num in range(1, 20):
        if num == 1:
            url = 'https://www.meitulu.com/item/8782.html'
        else:
            # NOTE(review): pattern lacks a '/' after 'item' — looks like it
            # should be 'item/8782_%d.html'; preserved as-is, verify manually.
            url = 'https://www.meitulu.com/item8782_%d.html' % num
        req = requests.get(url=url, headers=headers)
        req.encoding = 'utf-8'
        html = req.text
        bf = BeautifulSoup(html, 'lxml')
        # Image tags on this site carry class="content_img".
        targets_url = bf.find_all(class_='content_img')
        for each in targets_url:
            list_url.append(each.get('src'))
        print(list_url)

    # Ensure the output directory exists once, before downloading anything
    # (the original re-checked os.listdir() inside the loop).
    if 'images' not in os.listdir():
        os.makedirs('images')

    for index, each_img in enumerate(list_url):
        # BUG FIX: the original passed filename='images/' + '1' for every
        # image, so each download overwrote the previous one and only a
        # single file ever survived. Number the files by index instead.
        urlretrieve(url=each_img, filename='images/%d.jpg' % index)
        # Throttle: one request per second to avoid hammering the server.
        time.sleep(1)
    print('下载完成!')
\ No newline at end of file
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录