Commit e8fea871 authored by donggela

Crawler practice

Parent f02f63b3
@@ -6,17 +6,94 @@ import traceback
from time import sleep
from lxml import etree
from fake_useragent import UserAgent
import json
import string

headers = {
    'authority': "api.eol.cn",
    'scheme': 'https',
    'Connection': 'keep-alive',
    'Accept': 'application/json, text/plain, */*',
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.42',
    'Origin': 'https://www.gaokao.cn',
    'Referer': 'https://www.gaokao.cn/',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Content-Type': 'application/json'
}
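# Note: 'authority' and 'scheme' mirror the HTTP/2 pseudo-headers captured from the
# browser's devtools; requests simply sends them as ordinary header fields.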
def get_province():
    url = 'https://static-data.gaokao.cn/www/2.0/config/elective/config.json'
    response = requests.get(url, timeout=10)
    provinces = []
    if response.status_code == 200:
        text = response.content.decode('unicode_escape')
        data = json.loads(text)['data']['province']
        print(data)
        # provinces are grouped under the initial letter of their pinyin name
        for letter in string.ascii_uppercase:
            if letter in data:
                provinces.extend(data[letter])
        provinces.sort(key=lambda x: x['province_id'])
        print(provinces)
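
# The config.json payload is assumed (inferred from the indexing above, not from
# any API docs) to group provinces by initial letter, e.g.:
#   {"data": {"province": {"A": [{"province_id": ..., "province_name": ...}], "B": [...]}}}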
# specal: institution category (used as a flag key in the request)
# keyword: institution name to search for
# page: page number
# proviceId: province id
# size: results per page
# type: school-running type
# (see the example calls after the __main__ block)
def get_page(specal=None, keyword="", page=1, proviceId="", size=20, type=""):
    base_url = 'https://api.eol.cn/web/api/?'  # replace with the URL from the corresponding Ajax request
    n = 3
    while True:
        try:
            sleep(random.uniform(1, 2))  # pause for a random 1-2 seconds between requests
            data = {
                "keyword": keyword,
                "page": page,
                "province_id": proviceId,
                "ranktype": "",
                "request_type": 1,
                "size": size,
                "type": type,
                "uri": 'apidata/api/gkv3/school/lists',
                "signsafe": "7d3ad7653039f90d198e9dad129022c6",
            }
            if specal:
                data[specal] = 1  # the category name itself becomes the query key
            url = base_url + urlencode(data)
            print(url)
            try:
                response = requests.request("get", url, headers=headers)
                if response.status_code == 200:
                    text = response.content.decode('utf-8')
                    print(text)
                    return text  # hand the raw body back to the caller for parsing
            except requests.ConnectionError as e:
                print('Error', e.args)  # print the exception details
        except Exception:
            n -= 1
            if n == 0:
                print('All 3 attempts failed; giving up on this URL, check the request parameters')
                return
            else:
                print('Request failed, retrying')
                continue


if __name__ == '__main__':
    # get_province()
    get_page()
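
# Illustrative calls; the parameter values below are assumptions for demonstration,
# not taken from the site's documentation:
# get_page(keyword="大学", page=2, size=10)       # search by name, second page of 10 results
# get_page(specal="dual_class", proviceId="11")   # hypothetical category key and province id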
@@ -10,26 +10,36 @@ from bs4 import BeautifulSoup
headers = {
    'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36",
}
url = 'http://www.shijie500qiang.com/m/view.php?aid=47'
response = requests.get(url, headers=headers, timeout=10)
# soup = BeautifulSoup(response.content, 'html.parser')
# for link in soup.find_all('a'):
#     print(link.get('href'))
html = response.text
# print(html)
parse = etree.HTML(html)
# tr[position()>1] skips the table's header row; the id is specific to this page
all_tr = parse.xpath('//*[@id="modellist-2894592"]/div/div/table/tbody/tr[position()>1]')
all_data = []
for tr in all_tr:
    # some rows wrap the province name in a <span>, others put it directly in the <td>
    if tr.xpath("./td/span"):
        tr_data = {
            "sort": ''.join(tr.xpath('./td[1]/text()')).strip(),
            'province': ''.join(tr.xpath('./td[2]/span/text()')).strip(),
            'value': ''.join(tr.xpath('./td[3]/text()')).strip(),
        }
    else:
        tr_data = {
            "sort": ''.join(tr.xpath('./td[1]/text()')).strip(),
            'province': ''.join(tr.xpath('./td[2]/text()')).strip(),
            'value': ''.join(tr.xpath('./td[3]/text()')).strip(),
        }
    print(tr_data)
    all_data.append(tr_data)
print(all_data)
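
# A minimal sketch of persisting the scraped rows, assuming the three fields above
# (sort/province/value) are the full schema; the file name is arbitrary:
import csv

with open('shijie500qiang.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.DictWriter(f, fieldnames=['sort', 'province', 'value'])
    writer.writeheader()
    writer.writerows(all_data)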