# -*- coding: utf-8 -*-
"""Scrape a province ranking table from shijie500qiang.com and print the rows.

Fetches the article page, locates the ranking <table> via XPath, and collects
one dict per data row with keys 'sort', 'province', and 'value'.
"""
import csv  # for saving scraped data as CSV (Excel-openable); not yet written below
import random  # for randomizing request delays to mimic human browsing
import time  # for adding delays between requests (fast scraping triggers anti-bot)
from time import sleep  # convenience alias, same purpose as above

import requests  # HTTP client used to fetch the page
from bs4 import BeautifulSoup  # alternative parser, kept for the commented-out demo
from lxml import etree  # XPath-based HTML parsing

headers = {
    # Plausible desktop User-Agent so the request is not rejected as a bot.
    'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36",
}
url = 'http://www.shijie500qiang.com/m/view.php?aid=47'


def _parse_rows(html):
    """Return a list of {'sort', 'province', 'value'} dicts parsed from *html*.

    The XPath skips the table's header row (``tr[position()>1]``).  Some data
    rows wrap the province name in a ``<span>`` while others do not, so the
    province cell is read through whichever layout the row uses.
    """
    tree = etree.HTML(html)
    rows = tree.xpath('//*[@id="modellist-2894592"]/div/div/table/tbody/tr[position()>1]')
    data = []
    for tr in rows:
        # Only the province XPath differs between the two row layouts.
        province_xpath = './td[2]/span/text()' if tr.xpath("./td/span") else './td[2]/text()'
        data.append({
            "sort": ''.join(tr.xpath('./td[1]/text()')).strip(),
            'province': ''.join(tr.xpath(province_xpath)).strip(),
            'value': ''.join(tr.xpath('./td[3]/text()')).strip(),
        })
    return data


response = requests.get(url, headers=headers, timeout=10)
# Fail loudly on HTTP errors instead of silently parsing an error page.
response.raise_for_status()

# soup = BeautifulSoup(response.content, 'html.parser')
# for link in soup.find_all('a'):
#     print(link.get('href'))

html = response.text
# print(html)

all_data = _parse_rows(html)
for tr_data in all_data:
    print(tr_data)
print(all_data)