Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
donggela
Python
提交
e8fea871
P
Python
项目概览
donggela
/
Python
与 Fork 源项目一致
Fork自
inscode / Python
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Python
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
e8fea871
编写于
5月 23, 2023
作者:
donggela
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
爬虫练习
上级
f02f63b3
变更
2
隐藏空白更改
内联
并排
Showing
2 changed files
with
115 additions
and
28 deletions
+115
-28
dynamic.py
dynamic.py
+87
-10
staticHTML.py
staticHTML.py
+28
-18
未找到文件。
dynamic.py
浏览文件 @
e8fea871
...
@@ -6,17 +6,94 @@ import traceback
...
@@ -6,17 +6,94 @@ import traceback
from
time
import
sleep
from
time
import
sleep
from
lxml
import
etree
from
lxml
import
etree
from
fake_useragent
import
UserAgent
from
fake_useragent
import
UserAgent
import
json
base_url
=
'http://www.hshfy.sh.cn/shfy/gweb2017/ktgg_search_content.jsp?'
#这里要换成对应Ajax请求中的链接
import
string
# Request headers for the api.eol.cn (gaokao.cn) school-list API.
# Reconstructed post-commit version: the diff view interleaved the old
# hshfy.sh.cn headers with these; only the new entries are kept.
headers = {
    'authority': "api.eol.cn",
    'scheme': 'https',
    'Connection': 'keep-alive',
    'Accept': 'application/json, text/plain, */*',
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.42',
    'Origin': 'https://www.gaokao.cn',
    'Referer': 'https://www.gaokao.cn/',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Content-Type': 'application/json',
}
def get_province():
    """Fetch the province list from gaokao.cn's static elective config.

    Downloads the config JSON and flattens the letter-grouped (A-Z)
    province entries into a single list sorted by ``province_id``.

    Returns:
        list: province record dicts sorted by 'province_id'; an empty
        list when the request does not return HTTP 200.
        (Fix: the original built this list but never returned it.)
    """
    url = 'https://static-data.gaokao.cn/www/2.0/config/elective/config.json'
    response = requests.get(url, timeout=10)
    provinces = []
    if response.status_code == 200:
        # Payload contains \uXXXX escapes; decode them before parsing
        # (original behavior kept — NOTE(review): 'unicode_escape' is
        # fragile for non-ASCII bytes; confirm the endpoint's encoding).
        text = response.content.decode('unicode_escape')
        # Provinces are grouped under their uppercase initial letter.
        data = json.loads(text)['data']['province']
        print(data)
        for letter in string.ascii_uppercase:
            if letter in data:
                provinces.extend(data[letter])
        provinces.sort(key=lambda p: p['province_id'])
        print(provinces)
    return provinces
# specal:    school category — used as a *dynamic query-key name* (e.g. {"dual_class": 1})
# keyword:   school-name keyword
# page:      page number
# proviceId: province id (misspelling kept for caller compatibility)
# size:      results per page
# type:      school ownership type
def get_page(specal=None, keyword="", page=1, proviceId="", size=20, type=""):
    """Query the gaokao.cn school-list API and return the raw JSON text.

    Retries up to 3 times; returns None when all attempts fail.
    Fix: a non-200 response now counts as a failed attempt — the
    original only decremented the counter on exceptions, so repeated
    non-200 replies (or connection errors) looped forever.
    """
    base_url = 'https://api.eol.cn/web/api/?'  # Ajax endpoint of the school list
    attempts_left = 3
    while True:
        try:
            sleep(random.uniform(1, 2))  # random 1-2s pause to avoid hammering the API
            # Build query params once; when `specal` is given it becomes a
            # dynamic key inserted first, matching the original key order
            # (the original duplicated this whole literal in both branches).
            data = {}
            if specal:
                data[specal] = 1
            data.update({
                "keyword": keyword,
                "page": page,
                "province_id": proviceId,
                "ranktype": "",
                "request_type": 1,
                "size": size,
                "type": type,
                "uri": 'apidata/api/gkv3/school/lists',
                "signsafe": "7d3ad7653039f90d198e9dad129022c6",
            })
            url = base_url + urlencode(data)
            print(url)
            try:
                response = requests.request("get", url, headers=headers)
                if response.status_code == 200:
                    body = response.content.decode('utf-8')
                    print(body)
                    return body  # raw JSON text; parsing is left to the caller
                # Non-200: escalate to the retry handler below.
                raise TimeoutError('unexpected status %s' % response.status_code)
            except requests.ConnectionError as e:
                print('Error', e.args)  # log, then count as a failed attempt
                raise
        except (TimeoutError, Exception):  # broad by design: any failure triggers a retry
            attempts_left -= 1
            if attempts_left == 0:
                print('请求3次均失败,放弃此url请求,检查请求条件')
                return
            else:
                print('请求失败,重新请求')
                continue
# Script entry point: run the school-list query by default.
if __name__ == '__main__':
    # get_province()
    get_page()
staticHTML.py
浏览文件 @
e8fea871
...
@@ -10,26 +10,36 @@ from bs4 import BeautifulSoup
...
@@ -10,26 +10,36 @@ from bs4 import BeautifulSoup
# Static-HTML scraping demo: ranking table from shijie500qiang.com.
headers = {
    'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36",
}

url = 'http://www.shijie500qiang.com/m/view.php?aid=47'
response = requests.get(url, headers=headers, timeout=10)
html = response.text
parse = etree.HTML(html)
# Data rows of the ranking table; position()>1 skips the header row.
all_tr = parse.xpath('//*[@id="modellist-2894592"]/div/div/table/tbody/tr[position()>1]')

all_data = []
for tr in all_tr:
    # Some rows wrap the province cell in a <span>; pick the matching
    # xpath once instead of duplicating the whole dict literal (the
    # original also dead-initialized tr_data = {} before overwriting it).
    province_xpath = './td[2]/span/text()' if tr.xpath("./td/span") else './td[2]/text()'
    tr_data = {
        "sort": ''.join(tr.xpath('./td[1]/text()')).strip(),
        'province': ''.join(tr.xpath(province_xpath)).strip(),
        'value': ''.join(tr.xpath('./td[3]/text()')).strip(),
    }
    print(tr_data)
    all_data.append(tr_data)
print(all_data)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录