qyhua / Python 爬虫120例
Forked from 梦想橡皮擦 / Python 爬虫120例 (kept in sync with the upstream project)
Commit 90c810ec
Authored July 27, 2021 by 梦想橡皮擦
Commit message: Proxy server collection
Parent: 76835302
2 changed files with 231 additions and 0 deletions
NO15/ipporxy.txt (+1, -0)
NO15/代理IP.py (+230, -0)
NO15/ipporxy.txt (new file, mode 100644)
27.14.86.171:8000
NO15/代理IP.py (new file, mode 100644)
import requests
from lxml import etree
import random
import telnetlib


# Proxy check: try opening a TCP connection to ip:port via Telnet;
# working proxies are appended to ipporxy.txt.
def check_ip_port(ip_port):
    for item in ip_port:
        ip = item["ip"]
        port = item["port"]
        try:
            tn = telnetlib.Telnet(ip, port=int(port), timeout=3)
        except Exception:
            print('[-] ip:{}:{}'.format(ip, port))
        else:
            tn.close()
            print('[+] ip:{}:{}'.format(ip, port))
            with open('ipporxy.txt', 'a') as f:
                f.write(ip + ':' + port + '\n')
    print("Batch check finished")


# A second way to check proxies: route a real request through each one.
def check_proxy(ip_port):
    for item in ip_port:
        ip = item["ip"]
        port = item["port"]
        # url = 'https://api.ipify.org/?format=json'
        url = "http://icanhazip.com/"
        proxies = {
            "http": "http://{}:{}".format(ip, port),
            # Note: the https:// scheme here means the proxy itself must
            # speak TLS, which many free proxies do not.
            "https": "https://{}:{}".format(ip, port),
        }
        try:
            # res = requests.get(url, proxies=proxies, timeout=3).json()
            res = requests.get(url, proxies=proxies, timeout=3)
            # if 'ip' in res:
            #     print(res['ip'])
            if res.status_code == 200:
                # icanhazip.com echoes the IP the request arrived from
                print(res.text.strip())
        except Exception as e:
            print(e)


# Scrape free proxies from www.89ip.cn
def ip89(pagesize):
    url_format = "https://www.89ip.cn/index_{}.html"
    for page in range(1, pagesize + 1):
        url = url_format.format(page)
        text = get_html(url)
        ip_xpath = '//tbody/tr/td[1]/text()'
        port_xpath = '//tbody/tr/td[2]/text()'
        ret = format_html(text, ip_xpath, port_xpath)
        # Check whether the proxies work
        check_ip_port(ret)
        # check_proxy(ret)


# Scrape free proxies from www.66ip.cn
def ip66(pagesize):
    url_format = "http://www.66ip.cn/{}.html"
    for page in range(1, pagesize + 1):
        url = url_format.format(page)
        text = get_html(url)
        ip_xpath = '//table/tr[position()>1]/td[1]/text()'
        port_xpath = '//table/tr[position()>1]/td[2]/text()'
        ret = format_html(text, ip_xpath, port_xpath)
        check_ip_port(ret)


# Scrape free proxies from proxy.ip3366.net
def ip3366(pagesize):
    url_format = "https://proxy.ip3366.net/free/?action=china&page={}"
    for page in range(1, pagesize + 1):
        url = url_format.format(page)
        text = get_html(url)
        ip_xpath = '//td[@data-title="IP"]/text()'
        port_xpath = '//td[@data-title="PORT"]/text()'
        ret = format_html(text, ip_xpath, port_xpath)
        check_ip_port(ret)


# Scrape free proxies from ip.ihuan.me, which uses opaque page tokens
# instead of sequential page numbers
def ip_huan():
    urls = [
        "https://ip.ihuan.me/?page=b97827cc",
        "https://ip.ihuan.me/?page=4ce63706",
        "https://ip.ihuan.me/?page=5crfe930",
        "https://ip.ihuan.me/?page=f3k1d581",
        "https://ip.ihuan.me/?page=ce1d45977",
        "https://ip.ihuan.me/?page=881aaf7b5"
    ]
    for url in urls:
        text = get_html(url)
        ip_xpath = '//tbody/tr/td[1]/a/text()'
        port_xpath = '//tbody/tr/td[2]/text()'
        ret = format_html(text, ip_xpath, port_xpath)
        check_ip_port(ret)


# Scrape free proxies from www.kuaidaili.com
def ip_kuai(pagesize):
    url_format = "https://www.kuaidaili.com/free/inha/{}/"
    for page in range(1, pagesize + 1):
        url = url_format.format(page)
        text = get_html(url)
        ip_xpath = '//td[@data-title="IP"]/text()'
        port_xpath = '//td[@data-title="PORT"]/text()'
        ret = format_html(text, ip_xpath, port_xpath)
        check_ip_port(ret)


# Scrape free proxies from ip.jiangxianli.com; the 7th table row is skipped
def ip_jiangxi(pagesize):
    url_format = "https://ip.jiangxianli.com/?page={}"
    for page in range(1, pagesize + 1):
        url = url_format.format(page)
        text = get_html(url)
        ip_xpath = '//tbody/tr[position()!=7]/td[1]/text()'
        port_xpath = '//tbody/tr[position()!=7]/td[2]/text()'
        ret = format_html(text, ip_xpath, port_xpath)
        check_ip_port(ret)


# Scrape free proxies from www.kxdaili.com
def ip_kaixin(pagesize):
    url_format = "http://www.kxdaili.com/dailiip/1/{}.html"
    for page in range(1, pagesize + 1):
        url = url_format.format(page)
        text = get_html(url)
        ip_xpath = '//tbody/tr/td[1]/text()'
        port_xpath = '//tbody/tr/td[2]/text()'
        ret = format_html(text, ip_xpath, port_xpath)
        check_ip_port(ret)


# Scrape free proxies from www.nimadaili.com, where IP and port share
# a single "ip:port" table cell
def ip_nima(pagesize):
    url_format = "http://www.nimadaili.com/putong/{}/"
    for page in range(1, pagesize + 1):
        url = url_format.format(page)
        text = get_html(url)
        ip_xpath = '//tbody/tr/td[1]/text()'
        ret = format_html_ext(text, ip_xpath)
        check_ip_port(ret)


# Parse pages whose first cell holds a combined "ip:port" string
def format_html_ext(text, ip_xpath):
    # List of IP/port dicts to return
    ret = []
    if not text:
        # get_html() returned None, so the request failed
        return ret
    html = etree.HTML(text)
    ips = html.xpath(ip_xpath)
    print(ips)
    for ip in ips:
        item_dict = {
            "ip": ip.split(":")[0],
            "port": ip.split(":")[1]
        }
        ret.append(item_dict)
    return ret


# Parse pages with separate IP and port cells
def format_html(text, ip_xpath, port_xpath):
    # List of IP/port dicts to return
    ret = []
    if not text:
        # get_html() returned None, so the request failed
        return ret
    html = etree.HTML(text)
    ips = html.xpath(ip_xpath)
    ports = html.xpath(port_xpath)
    # Debug output; remove for production runs
    print(ips, ports)
    ip_port = zip(ips, ports)
    for ip, port in ip_port:
        item_dict = {
            "ip": ip.strip(),
            "port": port.strip()
        }
        ret.append(item_dict)
    return ret


# Fetch a page, returning its text or None on failure
def get_html(url):
    headers = get_headers()
    try:
        res = requests.get(url, headers=headers, timeout=5)
        return res.text
    except Exception as e:
        print("Request failed:", e)
        return None


# Build request headers with a User-Agent picked at random
def get_headers():
    uas = [
        "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
        "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
        "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
        "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
        "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
        "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
        "Sosospider+(+http://help.soso.com/webspider.htm)",
        "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)"
    ]
    ua = random.choice(uas)
    headers = {
        "user-agent": ua,
        "referer": "https://www.baidu.com"
    }
    return headers


def run():
    ip89(10)
    ip66(10)
    ip3366(2)
    ip_huan()
    ip_kuai(4)
    ip_jiangxi(4)
    ip_kaixin(10)
    ip_nima(5)


if __name__ == "__main__":
    run()
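
Verified proxies accumulate in NO15/ipporxy.txt, one ip:port pair per line (the first file in this diff shows the format). A minimal sketch of consuming that file with requests follows; the load_proxies helper and the httpbin.org test URL are illustrative, not part of this commit, and it assumes the collected proxies are plain HTTP:

import random
import requests

# Hypothetical helper, not part of this commit: read the "ip:port"
# lines that check_ip_port() appended to ipporxy.txt.
def load_proxies(path="ipporxy.txt"):
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]

proxy = random.choice(load_proxies())
# Assume a plain-HTTP proxy and route both http and https traffic through it.
proxies = {"http": "http://" + proxy, "https": "http://" + proxy}
res = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=5)
print(res.text)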