from requests_html import HTMLSession, HTML
import random
import time
import os


def get_detail() -> list:
    # Detail-page URLs waiting to be scraped
    wait_scrapy_urls = []
    # URL template for generating the paginated list-page addresses in bulk
    url_format = "http://landchina.mnr.gov.cn/land/crgg/gyyd/index_{}.htm"
    urls = ["http://landchina.mnr.gov.cn/land/crgg/gyyd/index.htm"]
    base_url = "http://landchina.mnr.gov.cn/land/crgg/gyyd/"
    # For testing, 3 list pages are enough: index.htm plus index_1.htm and index_2.htm
    for page in range(1, 3):
        urls.append(url_format.format(page))
    for url in urls:
        try:
            res = session.get(url, timeout=3)
            res.html.encoding = "utf-8"
            # print(res.html.html)
            details = res.html.find("ul.gu-iconList>li>a")
            for detail in details:
                # Links are relative, e.g. "./202109/t20210904_8081129.htm";
                # strip the leading "./" and join with the base URL to get
                # http://landchina.mnr.gov.cn/land/crgg/gyyd/202109/t20210904_8081129.htm
                wait_scrapy_urls.append(base_url + detail.attrs['href'][2:])
        except Exception as e:
            print("Failed to fetch a list page:", e)

        # Pause briefly between requests
        time.sleep(random.randint(1, 3))

    return wait_scrapy_urls


def save(index: int, url: str, retries: int = 3) -> None:
    try:
        print("Fetching:", url)
        res = session.get(url=url, timeout=3)
        res.html.encoding = "utf-8"
        with open(f"./htmls/{index}.html", "w", encoding="utf-8") as f:
            f.write(res.html.html)
    except Exception as e:
        print("Failed to fetch detail page:", e)
        # Retry a bounded number of times instead of recursing forever
        if retries > 0:
            save(index, url, retries - 1)


def analysis(html: str) -> list:
    # Placeholder: extraction currently happens inline in __main__ below;
    # see the sketch after this function for one possible implementation.
    return []
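

# A minimal sketch of what analysis() could return, assuming the detail pages
# keep the nested-table layout parsed in __main__ below. This is an editor's
# illustration, not the author's original code; the function name is
# hypothetical and the XPath expressions are copied from the extraction loop
# at the bottom of this file. The Chinese labels ("宗地编号:" = parcel number,
# "宗地总面积:" = total parcel area) are the literal cell texts on the target
# pages and must not be translated.
def analysis_sketch(html: str) -> list:
    doc = HTML(html=html)
    records = []
    # Each parcel sits in its own nested table; walk up from the label cell
    for table in doc.xpath('//td[contains(text(),"宗地编号:")]/../../../table'):
        card_id = table.xpath(
            './/td[contains(text(),"宗地编号:")]/following-sibling::td[1]/text()')[0].strip()
        area = table.xpath(
            './/td[contains(text(),"宗地总面积:")]/following-sibling::td[1]/text()')[0].strip()
        records.append((card_id, area))
    return records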


if __name__ == '__main__':
    session = HTMLSession()
    # Make sure the output directory exists before saving or reading files
    os.makedirs("./htmls", exist_ok=True)

    # Collect the detail-page URLs to scrape
    # scrapy_urls = get_detail()

    # Save each detail page's HTML locally for later analysis
    # for index, scrapy_url in enumerate(scrapy_urls):
    #     time.sleep(1)
    #     save(index, scrapy_url)

    # Extract data from the saved HTML files
    file_names = os.listdir("./htmls/")
    for file in file_names:
        with open(f"./htmls/{file}", "r", encoding="utf-8") as f:
            html_content = f.read()

            html_doc = HTML(html=html_content)
            # zongdi = html_doc.xpath('//td[contains(text(),"宗地编号:")]/following-sibling::td[1]/text()')
            # #
            # mianji = html_doc.xpath('//td[contains(text(),"宗地总面积:")]/following-sibling::td[1]/text()')
            # for z in mianji:
            #     print(z.strip())
            # 宗地编号:

            zongdi = html_doc.xpath('//td[contains(text(),"宗地编号:")]/../../../table')
            print(file)
            for z in zongdi:
                card_id = z.xpath('.//td[contains(text(),"宗地编号:")]/following-sibling::td[1]/text()')[0].strip()
                mianji = z.xpath('.//td[contains(text(),"宗地总面积:")]/following-sibling::td[1]/text()')[0].strip()
                print(card_id, mianji)