diff --git a/dynamic.py b/dynamic.py
index e156f0df5bfad7e7ec9167007fd1449953a0836e..a5c3a4e1c15e150303edc031a67bda565167f6cf 100644
--- a/dynamic.py
+++ b/dynamic.py
@@ -6,17 +6,80 @@ import traceback
 from time import sleep
 from lxml import etree
 from fake_useragent import UserAgent
-
-base_url = 'http://www.hshfy.sh.cn/shfy/gweb2017/ktgg_search_content.jsp?' #这里要换成对应Ajax请求中的链接
+import json
+import string
 
 headers = {
+    'authority': "api.eol.cn",
+    'scheme': 'https',
     'Connection': 'keep-alive',
-    'Accept': '*/*',
+    'Accept': 'application/json, text/plain, */*',
     'X-Requested-With': 'XMLHttpRequest',
-    'User-Agent': '你的User-Agent',
-    'Origin': 'http://www.hshfy.sh.cn',
-    'Referer': 'http://www.hshfy.sh.cn/shfy/gweb2017/ktgg_search.jsp?zd=splc',
-    'Accept-Language': 'zh-CN,zh;q=0.9',
-    'Content-Type': 'application/x-www-form-urlencoded',
-    'Cookie': '你的Cookie'
-}
\ No newline at end of file
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.42',
+    'Origin': 'https://www.gaokao.cn',
+    'Referer': 'https://www.gaokao.cn/',
+    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
+    'Content-Type': 'application/json'
+}
+
+
+def get_province():
+    """Fetch the province list used by the search filters, sorted by province_id."""
+    url = 'https://static-data.gaokao.cn/www/2.0/config/elective/config.json'
+    response = requests.get(url, timeout=10)
+    provinces = []
+    if response.status_code == 200:
+        data = response.json()['data']['province']  # provinces are grouped by initial letter
+        for letter in string.ascii_uppercase:
+            if letter in data:
+                provinces.extend(data[letter])
+        provinces.sort(key=lambda x: x['province_id'])
+        print(provinces)
+    return provinces
+
+
+# special:      institution category filter (sent as an extra "<special>": 1 key)
+# keyword:      institution name
+# page:         page number
+# province_id:  province id
+# size:         results per page
+# school_type:  type of institution
+def get_page(special=None, keyword="", page=1, province_id="", size=20, school_type=""):
+    base_url = 'https://api.eol.cn/web/api/?'  # replace with the URL from the corresponding Ajax request
+    n = 3
+    while True:
+        try:
+            sleep(random.uniform(1, 2))  # pause a random 1-2 seconds between requests
+            data = {
+                "keyword": keyword,
+                "page": page,
+                "province_id": province_id,
+                "ranktype": "",
+                "request_type": 1,
+                "size": size,
+                "type": school_type,
+                "uri": 'apidata/api/gkv3/school/lists',
+                "signsafe": "7d3ad7653039f90d198e9dad129022c6",
+            }
+            if special:
+                data[special] = 1  # the category filter is sent as its own query key
+
+            url = base_url + urlencode(data)
+            print(url)
+            response = requests.get(url, headers=headers, timeout=10)
+            if response.status_code == 200:
+                return response.content.decode('utf-8')  # raw JSON text; parsing is left to the caller
+            print('Bad status code:', response.status_code)
+        except requests.ConnectionError as e:
+            print('Error', e.args)  # log the exception, then fall through and retry
+        except Exception:
+            traceback.print_exc()
+        n -= 1
+        if n == 0:
+            print('Request failed 3 times; giving up on this URL, check the request parameters')
+            return
+        print('Request failed, retrying')
+
+
+if __name__ == '__main__':
+    # get_province()
+    get_page()
diff --git a/staticHTML.py b/staticHTML.py
index 7d4d137d666fdabd1808b95b83887c4eea1ac57c..147018258b87c060109ab853a13f94be90f3dba6 100644
--- a/staticHTML.py
+++ b/staticHTML.py
@@ -10,23 +10,29 @@ from bs4 import BeautifulSoup
 headers = {
     'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36",
 }
-for x in range(1,3):
-    url = 'http://yz.yuzhuprice.com:8003/findPriceByName.jspx?page.curPage={}&priceName=%E7%BA%A2%E6%9C%A8%E7%B1%BB'.format(x)
-    response = requests.get(url,headers=headers,timeout=10)
+url = 'http://www.shijie500qiang.com/m/view.php?aid=47'
 
-    # soup = BeautifulSoup(response.content, 'html.parser')
+response = requests.get(url, headers=headers, timeout=10)
 
-    # for link in soup.find_all('a'):
-    #     print(link.get('href'))
-    html = response.text
-    parse = etree.HTML(html)
-    all_tr = parse.xpath('//*[@id="173200"]')
-    for tr in all_tr:
-        tr = {
-            'name': ''.join(tr.xpath('./td[1]/text()')).strip(),
-            'price': ''.join(tr.xpath('./td[2]/text()')).strip(),
-            'unit': ''.join(tr.xpath('./td[3]/text()')).strip(),
-            'supermaket': ''.join(tr.xpath('./td[4]/text()')).strip(),
-            'time': ''.join(tr.xpath('./td[5]/text()')).strip()
+html = response.text
+parse = etree.HTML(html)
+# every data row of the ranking table; position()>1 skips the header row
+all_tr = parse.xpath('//*[@id="modellist-2894592"]/div/div/table/tbody/tr[position()>1]')
+all_data = []
+for tr in all_tr:
+    # some rows wrap the province name in a <span>, so pick the matching XPath
+    if tr.xpath("./td/span"):
+        tr_data = {
+            'sort': ''.join(tr.xpath('./td[1]/text()')).strip(),
+            'province': ''.join(tr.xpath('./td[2]/span/text()')).strip(),
+            'value': ''.join(tr.xpath('./td[3]/text()')).strip(),
+        }
+    else:
+        tr_data = {
+            'sort': ''.join(tr.xpath('./td[1]/text()')).strip(),
+            'province': ''.join(tr.xpath('./td[2]/text()')).strip(),
+            'value': ''.join(tr.xpath('./td[3]/text()')).strip(),
         }
-        print(tr)
+    print(tr_data)
+    all_data.append(tr_data)
+print(all_data)
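
Usage sketch for the new get_page() helper in dynamic.py (not part of the diff): it pages through the listing until a short page comes back. The response field names 'data' and 'item' are assumptions about the api.eol.cn JSON payload, not something this change confirms, so adjust them after inspecting one real response:

    import json

    def crawl_schools(province_id="", size=20):
        # Collect every school record by walking the paginated listing.
        schools = []
        page = 1
        while True:
            raw = get_page(page=page, province_id=province_id, size=size)
            if raw is None:  # get_page() gave up after three failed attempts
                break
            payload = json.loads(raw)
            items = payload.get('data', {}).get('item', [])  # assumed field names
            schools.extend(items)
            if len(items) < size:  # a short page means the last page
                break
            page += 1
        return schools

A randomized sleep already happens inside get_page(), so the loop needs no extra throttling.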
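
staticHTML.py only prints the scraped rows; here is a minimal sketch of persisting all_data with the standard csv module (the filename provinces.csv is illustrative):

    import csv

    def save_rows(rows, path='provinces.csv'):
        # Write the scraped dicts to CSV with a header row.
        if not rows:
            return
        with open(path, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=['sort', 'province', 'value'])
            writer.writeheader()
            writer.writerows(rows)

    # e.g. save_rows(all_data)

utf-8-sig is chosen so the Chinese province names open cleanly in Excel.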