Fri May 19 08:52:00 UTC 2023 inscode

63c061a3 · 64478506f791091b8f791bc8 · df352508 · 63c061a3
隐藏空白更改
内联并排

Showing 1 changed file with 31 additions and 1 deletion

main.py main.py +31 -1

未找到文件。
--- a/main.py
+++ b/main.py
-print('欢迎来到 InsCode')
+import csv  # for saving the scraped data as CSV, which Excel can open directly (not used in the lines shown here)
\ No newline at end of file
+import time  # for adding delays between requests; crawling too fast tends to trigger anti-scraping measures
+from time import sleep # same purpose as the line above
+import random  # for randomizing the delay so the traffic looks as human as possible
+import requests  # for sending requests to the website
+from lxml import etree  # HTML parsing and XPath queries (used below)
+import requests  # NOTE(review): duplicate of the import two lines up
+from bs4 import BeautifulSoup  # only referenced by the commented-out code in the lines shown here
+url = 'http://yz.yuzhuprice.com:8003/findPriceByName.jspx?page.curPage=1&priceName=%E7%BA%A2%E6%9C%A8%E7%B1%BB'  # page.curPage=1 requests the first result page; priceName is percent-encoded UTF-8 for the Chinese term 红木类 (roughly "rosewood category")
+headers = {
+    'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36",  # desktop Chrome User-Agent sent with the request
+}
+response = requests.get(url,headers=headers,timeout=10)  # one GET with a 10-second timeout; NOTE(review): the HTTP status is not checked before the body is parsed
+# soup = BeautifulSoup(response.content, 'html.parser')
+# for link in soup.find_all('a'):
+#     print(link.get('href'))
+html = response.text  # response body as text, decoded by requests
+parse = etree.HTML(html)  # parse the page into an lxml element tree
+all_tr = parse.xpath('//*[@id="173200"]')  # every element whose id attribute is "173200"; presumably table rows, TODO confirm against the live page
+for tr in all_tr:
+    tr = {  # NOTE(review): rebinds the loop variable to the extracted record; nothing in the lines shown here stores or outputs it
+        'name': ''.join(tr.xpath('./td[1]/text()')).strip(),  # each field joins the direct text nodes of the n-th <td> child and strips surrounding whitespace
+        'price': ''.join(tr.xpath('./td[2]/text()')).strip(),
+        'unit': ''.join(tr.xpath('./td[3]/text()')).strip(),
+        'supermaket': ''.join(tr.xpath('./td[4]/text()')).strip(),  # NOTE(review): key is spelled 'supermaket' (sic); renaming it would change the record schema
+        'time': ''.join(tr.xpath('./td[5]/text()')).strip()  # a missing <td> yields an empty string because joining an empty result list gives ''
+    }