Commit e8fea871, authored by donggela

Crawler practice

Parent f02f63b3
@@ -6,17 +6,94 @@ import traceback
 from time import sleep
 from lxml import etree
 from fake_useragent import UserAgent
+import json
+import string
-base_url = 'http://www.hshfy.sh.cn/shfy/gweb2017/ktgg_search_content.jsp?'  # replace this with the link from the corresponding Ajax request
 headers = {
+    'authority': "api.eol.cn",
+    'scheme': 'https',
     'Connection': 'keep-alive',
-    'Accept': '*/*',
+    'Accept': 'application/json, text/plain, */*',
     'X-Requested-With': 'XMLHttpRequest',
-    'User-Agent': '你的User-Agent',
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.42',
-    'Origin': 'http://www.hshfy.sh.cn',
+    'Origin': 'https://www.gaokao.cn',
-    'Referer': 'http://www.hshfy.sh.cn/shfy/gweb2017/ktgg_search.jsp?zd=splc',
+    'Referer': 'https://www.gaokao.cn/',
-    'Accept-Language': 'zh-CN,zh;q=0.9',
+    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
-    'Content-Type': 'application/x-www-form-urlencoded',
-    'Cookie': '你的Cookie'
-}
\ No newline at end of file
+    'Content-Type': 'application/json'
+}
+
+
+def get_province():
+    url = 'https://static-data.gaokao.cn/www/2.0/config/elective/config.json'
+    response = requests.get(url, timeout=10)
+    provinces = []
+    if response.status_code == 200:
+        re = response.content.decode('unicode_escape')
+        data = json.loads(re)['data']['province']
+        print(data)
+        for word in string.ascii_uppercase:
+            if word in data:
+                provinces.extend(data[word])
+        provinces.sort(key=lambda x: x['province_id'])
+        print(provinces)
+
+
+# specal: institution category
+# keyword: institution name
+# page: page number
+# proviceId: province id
+# size: results per page
+# type: school-running type
+def get_page(specal=None, keyword="", page=1, proviceId="", size=20, type=""):
+    base_url = 'https://api.eol.cn/web/api/?'  # replace this with the link from the corresponding Ajax request
+    n = 3
+    while True:
+        try:
+            sleep(random.uniform(1, 2))  # random number between 1 and 2, decimals included
+            data = {}
+            if specal:
+                data = {
+                    specal: 1,
+                    "keyword": keyword,
+                    "page": page,
+                    "province_id": proviceId,
+                    "ranktype": "",
+                    "request_type": 1,
+                    "size": size,
+                    "type": type,
+                    "uri": 'apidata/api/gkv3/school/lists',
+                    "signsafe": "7d3ad7653039f90d198e9dad129022c6",
+                }
+            else:
+                data = {
+                    "keyword": keyword,
+                    "page": page,
+                    "province_id": proviceId,
+                    "ranktype": "",
+                    "request_type": 1,
+                    "size": size,
+                    "type": type,
+                    "uri": 'apidata/api/gkv3/school/lists',
+                    "signsafe": "7d3ad7653039f90d198e9dad129022c6",
+                }
+            url = base_url + urlencode(data)
+            print(url)
+            try:
+                response = requests.request("get", url, headers=headers)
+                # print(response)
+                if response.status_code == 200:
+                    re = response.content.decode('utf-8')
+                    print(re)
+                    return re  # content to be parsed
+            except requests.ConnectionError as e:
+                print('Error', e.args)  # print the exception info
+        except (TimeoutError, Exception):
+            n -= 1
+            if n == 0:
+                print('请求3次均失败,放弃此url请求,检查请求条件')
+                return
+            else:
+                print('请求失败,重新请求')
+                continue
+
+
+if __name__ == '__main__':
+    # get_province()
+    get_page()
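Taken together, the two new helpers are meant to be chained: get_province() collects the province entries (each carrying a province_id) and get_page() queries the school-list API for one province and one page. Below is a minimal driver sketch of that wiring; it assumes get_province() is adjusted to return its provinces list rather than only printing it, and it treats the JSON string returned by get_page() as opaque, since this commit does not show the payload layout. crawl_all_provinces and pages_per_province are illustrative names, not part of the commit.

import json

def crawl_all_provinces(pages_per_province=2):
    # Assumes get_province() has been changed to `return provinces` at the end.
    provinces = get_province()
    results = {}
    for p in provinces:
        pid = p['province_id']  # same key used by the sort() call in get_province()
        for page in range(1, pages_per_province + 1):
            raw = get_page(page=page, proviceId=pid)
            if raw is None:  # get_page() returns None after three failed attempts
                break
            # The payload structure is not documented in this commit, so the
            # decoded JSON is stored as-is, keyed by (province_id, page).
            results[(pid, page)] = json.loads(raw)
    return results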
@@ -10,26 +10,36 @@ from bs4 import BeautifulSoup
 headers = {
     'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36",
 }
-for x in range(1,3):
-    url = 'http://yz.yuzhuprice.com:8003/findPriceByName.jspx?page.curPage={}&priceName=%E7%BA%A2%E6%9C%A8%E7%B1%BB'.format(x)
-    response = requests.get(url, headers=headers, timeout=10)
-    # soup = BeautifulSoup(response.content, 'html.parser')
-    # for link in soup.find_all('a'):
-    #     print(link.get('href'))
-    html = response.text
-    parse = etree.HTML(html)
-    all_tr = parse.xpath('//*[@id="173200"]')
-    for tr in all_tr:
-        tr = {
-            'name': ''.join(tr.xpath('./td[1]/text()')).strip(),
-            'price': ''.join(tr.xpath('./td[2]/text()')).strip(),
-            'unit': ''.join(tr.xpath('./td[3]/text()')).strip(),
-            'supermaket': ''.join(tr.xpath('./td[4]/text()')).strip(),
-            'time': ''.join(tr.xpath('./td[5]/text()')).strip()
-        }
-        print(tr)
+
+url = 'http://www.shijie500qiang.com/m/view.php?aid=47'
+response = requests.get(url, headers=headers, timeout=10)
+# soup = BeautifulSoup(response.content, 'html.parser')
+# for link in soup.find_all('a'):
+#     print(link.get('href'))
+html = response.text
+# print(html)
+parse = etree.HTML(html)
+all_tr = parse.xpath('//*[@id="modellist-2894592"]/div/div/table/tbody/tr[position()>1]')
+all_data = []
+for tr in all_tr:
+    tr_data = {}
+    if tr.xpath("./td/span"):
+        tr_data = {
+            "sort": ''.join(tr.xpath('./td[1]/text()')).strip(),
+            'province': ''.join(tr.xpath('./td[2]/span/text()')).strip(),
+            'value': ''.join(tr.xpath('./td[3]/text()')).strip(),
+        }
+    else:
+        tr_data = {
+            "sort": ''.join(tr.xpath('./td[1]/text()')).strip(),
+            'province': ''.join(tr.xpath('./td[2]/text()')).strip(),
+            'value': ''.join(tr.xpath('./td[3]/text()')).strip(),
+        }
+    print(tr_data)
+    all_data.append(tr_data)
+print(all_data)
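The second script only prints the scraped rows. A small follow-up sketch for persisting all_data is shown below; the CSV file name and the decision to persist at all are illustrative additions rather than part of the commit, and the field names simply mirror the keys built in the loop above.

import csv

def save_rows(rows, path='province_ranking.csv'):
    # Write the scraped rows to CSV; utf-8-sig keeps Chinese text readable in Excel.
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=['sort', 'province', 'value'])
        writer.writeheader()
        writer.writerows(rows)

save_rows(all_data)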