"""Scrape one page of price listings from yz.yuzhuprice.com.

Requests the first result page of the site's ``findPriceByName`` search,
parses the returned HTML with lxml and collects one record per matched
element in the module-level list ``rows``.

NOTE(review): this file was stored as a ``git format-patch`` e-mail
(commit 63c061a3, dated Fri, 19 May 2023 08:52:00 +0000, subject
"Fri May 19 08:52:00 UTC 2023 inscode") collapsed onto a single line.
The code below is that patch's post-image of ``main.py``, which replaced
a one-line placeholder ``print(...)`` greeting.
"""

import csv  # for saving scraped data as CSV (opens directly in Excel); not used yet
import random  # for randomising the delay to mimic a human visitor; not used yet
import time  # for delaying requests; crawling too fast tends to trigger anti-scraping; not used yet
from time import sleep  # same purpose as ``time`` above; not used yet

import requests  # for sending the HTTP request to the site
from bs4 import BeautifulSoup  # not used by the code below
from lxml import etree

# First result page (page.curPage=1) of the price search. ``priceName`` is a
# percent-encoded UTF-8 search term (it decodes to "红木类").
url = 'http://yz.yuzhuprice.com:8003/findPriceByName.jspx?page.curPage=1&priceName=%E7%BA%A2%E6%9C%A8%E7%B1%BB'
headers = {
    'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36",
}

# Record key -> XPath of the cell, relative to the matched element.
# NOTE: the key 'supermaket' is misspelled in the original; it is kept
# byte-for-byte because it is runtime data that consumers may rely on.
_FIELD_XPATHS = {
    'name': './td[1]/text()',
    'price': './td[2]/text()',
    'unit': './td[3]/text()',
    'supermaket': './td[4]/text()',
    'time': './td[5]/text()',
}


def _cell_text(node, xpath):
    """Return the concatenated, stripped text nodes selected by *xpath* under *node*."""
    return ''.join(node.xpath(xpath)).strip()


def _row_to_record(node):
    """Build one record dict (keys of ``_FIELD_XPATHS``) from a matched element."""
    return {field: _cell_text(node, xpath) for field, xpath in _FIELD_XPATHS.items()}


response = requests.get(url, headers=headers, timeout=10)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
response.raise_for_status()

html = response.text
parse = etree.HTML(html)

# Every element whose id attribute is "173200"; each match is treated as a
# table row with five <td> cells.
# NOTE(review): an id normally identifies a single element, so this may match
# at most one row -- confirm the selector against the live page.
# Defensive: skip the query if the parser produced no document tree.
all_tr = parse.xpath('//*[@id="173200"]') if parse is not None else []

# The original loop rebound its own loop variable to the new dict and then
# discarded it on every iteration; keep the records so they can be used.
rows = [_row_to_record(tr) for tr in all_tr]