Commit e8fea871, authored by donggela

Crawler practice

Parent f02f63b3
@@ -6,17 +6,94 @@ import traceback
 from time import sleep
 from lxml import etree
 from fake_useragent import UserAgent
+import json
+import string
-base_url = 'http://www.hshfy.sh.cn/shfy/gweb2017/ktgg_search_content.jsp?'  # replace this with the link from the corresponding Ajax request
 headers = {
+    'authority': "api.eol.cn",
+    'scheme': 'https',
     'Connection': 'keep-alive',
-    'Accept': '*/*',
+    'Accept': 'application/json, text/plain, */*',
     'X-Requested-With': 'XMLHttpRequest',
-    'User-Agent': '你的User-Agent',
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.42',
-    'Origin': 'http://www.hshfy.sh.cn',
+    'Origin': 'https://www.gaokao.cn',
-    'Referer': 'http://www.hshfy.sh.cn/shfy/gweb2017/ktgg_search.jsp?zd=splc',
+    'Referer': 'https://www.gaokao.cn/',
-    'Accept-Language': 'zh-CN,zh;q=0.9',
+    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
-    'Content-Type': 'application/x-www-form-urlencoded',
-    'Cookie': '你的Cookie'
-}
\ No newline at end of file
+    'Content-Type': 'application/json'
+}
+
+
+def get_province():
+    url = 'https://static-data.gaokao.cn/www/2.0/config/elective/config.json'
+    response = requests.get(url, timeout=10)
+    provinces = []
+    if response.status_code == 200:
+        re = response.content.decode('unicode_escape')
+        data = json.loads(re)['data']['province']
+        print(data)
+        for word in string.ascii_uppercase:
+            if word in data:
+                provinces.extend(data[word])
+        provinces.sort(key=lambda x: x['province_id'])
+        print(provinces)
+
+
+# specal: institution category
+# keyword: institution name
+# page: page number
+# proviceId: province id
+# size: results per page
+# type: school-running type
+def get_page(specal=None, keyword="", page=1, proviceId="", size=20, type=""):
+    base_url = 'https://api.eol.cn/web/api/?'  # replace this with the link from the corresponding Ajax request
+    n = 3
+    while True:
+        try:
+            sleep(random.uniform(1, 2))  # random number between 1 and 2, decimals included
+            data = {}
+            if specal:
+                data = {
+                    specal: 1,
+                    "keyword": keyword,
+                    "page": page,
+                    "province_id": proviceId,
+                    "ranktype": "",
+                    "request_type": 1,
+                    "size": size,
+                    "type": type,
+                    "uri": 'apidata/api/gkv3/school/lists',
+                    "signsafe": "7d3ad7653039f90d198e9dad129022c6",
+                }
+            else:
+                data = {
+                    "keyword": keyword,
+                    "page": page,
+                    "province_id": proviceId,
+                    "ranktype": "",
+                    "request_type": 1,
+                    "size": size,
+                    "type": type,
+                    "uri": 'apidata/api/gkv3/school/lists',
+                    "signsafe": "7d3ad7653039f90d198e9dad129022c6",
+                }
+            url = base_url + urlencode(data)
+            print(url)
+            try:
+                response = requests.request("get", url, headers=headers)
+                # print(response)
+                if response.status_code == 200:
+                    re = response.content.decode('utf-8')
+                    print(re)
+                    return re  # content to be parsed
+            except requests.ConnectionError as e:
+                print('Error', e.args)  # print the exception info
+        except (TimeoutError, Exception):
+            n -= 1
+            if n == 0:
+                print('请求3次均失败,放弃此url请求,检查请求条件')
+                return
+            else:
+                print('请求失败,重新请求')
+                continue
+
+
+if __name__ == '__main__':
+    # get_province()
+    get_page()
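Taken together, the two new helpers are meant to be chained: get_province() collects the province entries (each carrying a province_id) and get_page() queries the school-list API for one province and one page. Below is a minimal driver sketch of that wiring; it assumes get_province() is adjusted to return its provinces list rather than only printing it, and it treats the JSON string returned by get_page() as opaque, since this commit does not show the payload layout. crawl_all_provinces and pages_per_province are illustrative names, not part of the commit.

import json

def crawl_all_provinces(pages_per_province=2):
    # Assumes get_province() has been changed to `return provinces` at the end.
    provinces = get_province()
    results = {}
    for p in provinces:
        pid = p['province_id']  # same key used by the sort() call in get_province()
        for page in range(1, pages_per_province + 1):
            raw = get_page(page=page, proviceId=pid)
            if raw is None:  # get_page() returns None after three failed attempts
                break
            # The payload structure is not documented in this commit, so the
            # decoded JSON is stored as-is, keyed by (province_id, page).
            results[(pid, page)] = json.loads(raw)
    return results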
@@ -10,26 +10,36 @@ from bs4 import BeautifulSoup
 headers = {
     'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36",
 }
-for x in range(1,3):
-    url = 'http://yz.yuzhuprice.com:8003/findPriceByName.jspx?page.curPage={}&priceName=%E7%BA%A2%E6%9C%A8%E7%B1%BB'.format(x)
-    response = requests.get(url, headers=headers, timeout=10)
-    # soup = BeautifulSoup(response.content, 'html.parser')
-    # for link in soup.find_all('a'):
-    #     print(link.get('href'))
-    html = response.text
-    parse = etree.HTML(html)
-    all_tr = parse.xpath('//*[@id="173200"]')
-    for tr in all_tr:
-        tr = {
-            'name': ''.join(tr.xpath('./td[1]/text()')).strip(),
-            'price': ''.join(tr.xpath('./td[2]/text()')).strip(),
-            'unit': ''.join(tr.xpath('./td[3]/text()')).strip(),
-            'supermaket': ''.join(tr.xpath('./td[4]/text()')).strip(),
-            'time': ''.join(tr.xpath('./td[5]/text()')).strip()
-        }
-        print(tr)
+
+url = 'http://www.shijie500qiang.com/m/view.php?aid=47'
+response = requests.get(url, headers=headers, timeout=10)
+# soup = BeautifulSoup(response.content, 'html.parser')
+# for link in soup.find_all('a'):
+#     print(link.get('href'))
+html = response.text
+# print(html)
+parse = etree.HTML(html)
+all_tr = parse.xpath('//*[@id="modellist-2894592"]/div/div/table/tbody/tr[position()>1]')
+all_data = []
+for tr in all_tr:
+    tr_data = {}
+    if tr.xpath("./td/span"):
+        tr_data = {
+            "sort": ''.join(tr.xpath('./td[1]/text()')).strip(),
+            'province': ''.join(tr.xpath('./td[2]/span/text()')).strip(),
+            'value': ''.join(tr.xpath('./td[3]/text()')).strip(),
+        }
+    else:
+        tr_data = {
+            "sort": ''.join(tr.xpath('./td[1]/text()')).strip(),
+            'province': ''.join(tr.xpath('./td[2]/text()')).strip(),
+            'value': ''.join(tr.xpath('./td[3]/text()')).strip(),
+        }
+    print(tr_data)
+    all_data.append(tr_data)
+print(all_data)
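The second script only prints the scraped rows. A small follow-up sketch for persisting all_data is shown below; the CSV file name and the decision to persist at all are illustrative additions rather than part of the commit, and the field names simply mirror the keys built in the loop above.

import csv

def save_rows(rows, path='province_ranking.csv'):
    # Write the scraped rows to CSV; utf-8-sig keeps Chinese text readable in Excel.
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=['sort', 'province', 'value'])
        writer.writeheader()
        writer.writerows(rows)

save_rows(all_data)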