from requests_html import HTMLSession, HTML
import random
import time
import os


def get_detail() -> list:
    """Collect the detail-page URLs from the paginated listing.

    Builds the listing-page URLs (index page + first 2 numbered pages),
    fetches each one, and extracts every detail link found under
    ``ul.gu-iconList > li > a``.

    Returns:
        list[str]: absolute detail-page URLs to scrape.
    """
    wait_scrapy_urls = []
    # Template for generating the numbered listing pages in bulk.
    url_format = "http://landchina.mnr.gov.cn/land/crgg/gyyd/index_{}.htm"
    urls = ["http://landchina.mnr.gov.cn/land/crgg/gyyd/index.htm"]
    base_url = "http://landchina.mnr.gov.cn/land/crgg/gyyd/"
    # For testing only the first few pages are collected.
    for page in range(1, 3):
        urls.append(url_format.format(page))
    for url in urls:
        try:
            res = session.get(url, timeout=3)
            res.html.encoding = "utf-8"
            details = res.html.find("ul.gu-iconList>li>a")
            for detail in details:
                # href is relative, e.g. "./202109/t20210904_8081129.htm";
                # drop the leading "./" and join with the section base URL.
                wait_scrapy_urls.append(base_url + detail.attrs['href'][2:])
        except Exception as e:
            print("采集分页数据异常", e)
        # Polite random pause between listing-page requests.
        time.sleep(random.randint(1, 3))
    return wait_scrapy_urls


def save(index: int, url: str, retries: int = 3) -> None:
    """Download one detail page and store its HTML as ./htmls/{index}.html.

    Retries up to ``retries`` times on failure. (The original version
    retried by calling itself recursively with no depth limit, so a
    permanently unreachable URL would eventually raise RecursionError.)

    Args:
        index: ordinal used as the output file name.
        url: detail-page URL to fetch.
        retries: maximum number of attempts before giving up.
    """
    for _ in range(retries):
        try:
            print("正在采集:", url)
            res = session.get(url=url, timeout=3)
            res.html.encoding = "utf-8"
            with open(f"./htmls/{index}.html", "w", encoding="utf-8") as f:
                f.write(res.html.html)
            return
        except Exception as e:
            print("采集详情页数据异常", e)


def analysis(html: str) -> list:
    """Placeholder for parsing a saved detail page; not implemented yet."""
    return []


if __name__ == '__main__':
    session = HTMLSession()
    # Step 1: collect the detail-page URLs (disabled after the first run).
    # scrapy_urls = get_detail()
    # Step 2: persist each detail page locally for offline analysis.
    # for index, scrapy_url in enumerate(scrapy_urls):
    #     time.sleep(1)
    #     save(index, scrapy_url)
    # Step 3: extract data from the locally saved HTML files.
    file_names = os.listdir("./htmls/")
    for file in file_names:
        with open(f"./htmls/{file}", "r", encoding="utf-8") as f:
            html_content = f.read()
        html_doc = HTML(html=html_content)
        # Locate every <table> that contains a "宗地编号:" label cell.
        tables = html_doc.xpath('//td[contains(text(),"宗地编号:")]/../../../table')
        print(file)
        for z in tables:
            card_ids = z.xpath(
                './/td[contains(text(),"宗地编号:")]/following-sibling::td[1]/text()')
            areas = z.xpath(
                './/td[contains(text(),"宗地总面积:")]/following-sibling::td[1]/text()')
            # Guard both lookups: the original indexed [0] unconditionally
            # and raised IndexError on tables missing either field.
            if card_ids and areas:
                print(card_ids[0].strip(), areas[0].strip())