# 黄鹤楼.py 1.5 KB
# Newer Older
# 梦想橡皮擦's avatar
# 梦想橡皮擦 已提交
import asyncio
import os
import threading
import time

import requests
import lxml
from bs4 import BeautifulSoup


async def get(url):
    """Issue an HTTP GET for *url* without blocking the event loop.

    ``requests.get`` is a synchronous, blocking call; awaiting it directly
    inside a coroutine would freeze every other task on the loop while the
    request is in flight. Run it on the default thread-pool executor instead
    so concurrent fetches actually overlap.

    Returns the ``requests.Response`` object, as before.
    """
    loop = asyncio.get_event_loop()
    return await loop.run_in_executor(None, requests.get, url)


async def get_html(url):
    """Download the page at *url* and return its HTML body as text."""
    print("准备抓取:", url)
    return (await get(url)).text


async def save_img(img_url):
    """Download the image at *img_url* and write it under ``./imgs/``.

    Returns ``(img_url, "ok")`` on success, or ``None`` when no response
    was obtained.
    """
    print("图片下载中:", img_url)
    res = await get(img_url)
    # Guard clause instead of nesting the happy path under an if.
    if res is None:
        return None
    # Create the target directory on demand; the original crashed with
    # FileNotFoundError when ./imgs did not already exist.
    os.makedirs('./imgs', exist_ok=True)
    # time.time_ns() (Python 3.7+) yields a collision-free name even when
    # two downloads complete within the same float-timestamp tick.
    path = f'./imgs/{time.time_ns()}.jpg'
    with open(path, 'wb') as f:
        f.write(res.content)
    return img_url, "ok"


async def main(url_list):
    """Fetch every page in *url_list* concurrently, then download the
    images found inside each page's ``lbox`` container one by one.

    The original comment claimed "5 tasks"; in fact one task is created
    per URL in the slice passed in by the caller.
    """
    # One fetch task per URL — iterate the list directly instead of
    # indexing via range(len(...)).
    tasks = [asyncio.ensure_future(get_html(url)) for url in url_list]

    dones, pending = await asyncio.wait(tasks)
    for task in dones:
        html = task.result()
        soup = BeautifulSoup(html, 'lxml')
        div_tag = soup.find(attrs={'class': 'lbox'})
        # Skip pages whose layout lacks the expected container; the
        # original raised AttributeError on div_tag.find_all here.
        if div_tag is None:
            continue
        for img in div_tag.find_all('img'):
            # Lazy-loaded images carry the real URL in data-original.
            ret = await save_img(img["data-original"])
            print(ret)


if __name__ == '__main__':
    # Changed to Huanghelou for easy testing; range(1, 10) yields 9 pages.
    urls = [f"https://www.huanghelou.cc/category-44_{page}.html" for page in range(1, 10)]

    # Fetch one loop instance up front instead of looking it up on every
    # iteration (asyncio.get_event_loop returns the same loop each time).
    loop = asyncio.get_event_loop()

    # Walk the URL list in slices of 5. range's step argument replaces the
    # original's manual page-count arithmetic ('totle_page' typo included)
    # and the redundant `0 if page == 0 else page * 5` start computation.
    for start in range(0, len(urls), 5):
        loop.run_until_complete(main(urls[start:start + 5]))