index.py 2.2 KB
Newer Older
H
hjCodeCloud 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78
import requests
import re
import time

# 声明 UA
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36"
}
# 存储异常路径,防止出现爬取失败情况
error_list = []

# 爬虫入口
def run():
    url = "http://www.ultramanclub.com/allultraman/"
    try:
        # 网页访问速度慢,需要设置 timeout
        res = requests.get(url=url, headers=headers, timeout=10)
        res.encoding = "gb2312"
        html = res.text
        return get_detail_list(html)

    except Exception as e:
        print("请求异常", e)


# 获取全部奥特曼详情页
def get_detail_list(html):
    start_index = '<ul class="lists">'
    start = html.find(start_index)
    html = html[start:]
    links = re.findall('<li class="item"><a href="(.*)">', html)
    # links = list(set(links))
    links = [
        f"http://www.ultramanclub.com/allultraman/{i.split('/')[1]}/" for i in set(links)]
    return links


def get_image(url):
    try:
        # 网页访问速度慢,需要设置 timeout
        res = requests.get(url=url, headers=headers, timeout=15)
        res.encoding = "gb2312"
        html = res.text
        print(url)
        # 获取详情页标题,作为图片文件名
        title = re.search('<title>(.*?)\[', html).group(1)
        # 获取图片短连接地址
        image_short = re.search(
            '<figure class="image tile">[.\s]*?<img src="(.*?)"', html).group(1)

        # 拼接完整图片地址
        img_url = "http://www.ultramanclub.com/allultraman/" + image_short[3:]
        # 获取图片数据
        img_data = requests.get(img_url).content
        print(f"正在爬取{title}")
        if title is not None and image_short is not None:
            with open(f"images/{title}.png", "wb") as f:
                f.write(img_data)

    except Exception as e:
        print("*"*100)
        print(url)
        print("请求异常", e)

        error_list.append(url)


if __name__ == '__main__':
    details = run()
    for detail in details:
        get_image(detail)

    while len(error_list) > 0:
        print("再次爬取")
        detail = error_list.pop()
        get_image(detail)

    print("奥特曼图片数据爬取完毕")