# 19LOU相亲爬虫.py (1.6 KB) — 19lou.com dating-forum image crawler
# Author: hjCodeCloud
import re
import time

import requests
from fake_useragent import UserAgent
from lxml import etree


def save(src, title):
    """Download the image at *src* and store it as imgs/<title>.jpg.

    Parameters:
        src: absolute URL of the image to download.
        title: alt-text used as the file name (sanitized before use).

    Any network or filesystem error is printed and swallowed so a single
    bad image does not abort the whole crawl.
    """
    # The title comes from an arbitrary HTML alt attribute; strip characters
    # that are illegal in file names (or would act as path separators) so it
    # cannot break or escape the imgs/ destination path.
    safe_title = re.sub(r'[\\/:*?"<>|]', "_", title)
    try:
        # Timeout prevents one stalled download from hanging the crawler.
        res = requests.get(src, timeout=10)
        # Surface HTTP errors (4xx/5xx) instead of writing an error page to disk.
        res.raise_for_status()
        # "wb" — we only write; the original "wb+" read access was unused.
        with open(f"imgs/{safe_title}.jpg", "wb") as f:
            f.write(res.content)
    except Exception as e:
        print(e)


def run(url):
    """Fetch one 19lou.com list page and save every photo found on it.

    Parameters:
        url: URL of a list page (e.g. https://www.19lou.com/r/1/19lnsxq-233.html).

    Errors are printed rather than raised so the caller can continue with
    the next page.
    """
    # A fixed UA string is used instead of fake_useragent's UserAgent(),
    # which needs an extra network fetch of its browser database.
    ua = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36"
    headers = {
        "User-Agent": ua,
        "Host": "www.19lou.com",
        "Referer": "https://www.19lou.com/r/1/19lnsxq-233.html",
        "Cookie": "_Z3nY0d4C_=37XgPK9h"  # anti-scraping token taken from the site's JS
    }
    try:
        # Timeout so one dead page cannot hang the whole crawl.
        res = requests.get(url=url, headers=headers, timeout=10)
        res.raise_for_status()
        # Parse the HTML into an lxml Element tree.
        html = etree.HTML(res.text)
        # Each photo lives in a <div class="pics">.
        divs = html.xpath("//div[@class='pics']")
        for div in divs:
            # Images are lazy-loaded: the real address is in data-src, not src.
            srcs = div.xpath("./img/@data-src")
            if not srcs:
                # A pics div without a usable <img> previously raised
                # IndexError and aborted the rest of the page; skip it instead.
                continue
            titles = div.xpath("./img/@alt")
            title = titles[0] if titles else "untitled"
            save(srcs[0], title)
    except Exception as e:
        print(e)


if __name__ == '__main__':
    # The first list page has no numeric suffix; the numbered pages the
    # crawl targets run from 114 to 242 inclusive.
    urls = ["https://www.19lou.com/r/1/19lnsxq.html"] + [
        f"https://www.19lou.com/r/1/19lnsxq-{i}.html" for i in range(114, 243)
    ]
    for url in urls:
        print(f"正在抓取{url}")
        run(url)
        # time.sleep(5)

    print("全部爬取完毕")