diff --git "a/NO34/imgs/\347\210\261\345\277\203\346\222\225\347\272\270\350\211\272\346\234\257\344\272\214\347\273\264\347\240\201.jpg" "b/NO34/imgs/\347\210\261\345\277\203\346\222\225\347\272\270\350\211\272\346\234\257\344\272\214\347\273\264\347\240\201.jpg" new file mode 100644 index 0000000000000000000000000000000000000000..4141eb0c7716ac3cf79f7bf640a8fb93925d8212 Binary files /dev/null and "b/NO34/imgs/\347\210\261\345\277\203\346\222\225\347\272\270\350\211\272\346\234\257\344\272\214\347\273\264\347\240\201.jpg" differ diff --git "a/NO34/\347\254\254\344\271\235\345\267\245\345\234\272.py" "b/NO34/\347\254\254\344\271\235\345\267\245\345\234\272.py" new file mode 100644 index 0000000000000000000000000000000000000000..71d34b6687190c8fd404c60cb413d848e8762084 --- /dev/null +++ "b/NO34/\347\254\254\344\271\235\345\267\245\345\234\272.py" @@ -0,0 +1,55 @@ +from bs4 import BeautifulSoup +import requests +import logging + +logging.basicConfig(level=logging.NOTSET) + + +def get_html(url, headers) -> None: + try: + res = requests.get(url=url, headers=headers, timeout=3) + except Exception as e: + logging.debug("采集异常", e) + + if res is not None: + html_str = res.text + soup = BeautifulSoup(html_str, "html.parser") + imgs = soup.find_all(attrs={'class': 'lazy'}) + print("获取到的数据量是", len(imgs)) + datas = [] + for item in imgs: + name = item.get('alt') + src = item["src"] + logging.info(f"{name},{src}") + # 获取拼接数据 + datas.append((name, src)) + save(datas, headers) + + +def save(datas, headers) -> None: + if datas is not None: + for item in datas: + try: + # 抓取图片 + res = requests.get(url=item[1], headers=headers, timeout=5) + except Exception as e: + + logging.debug("图片采集异常" + str(e)) + + if res is not None: + img_data = res.content + with open("./imgs/{}.jpg".format(item[0]), "wb+") as f: + f.write(img_data) + else: + return None + + +if __name__ == '__main__': + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36", + "Referer": "https://www.9thws.com/" + } + url_format = "https://www.9thws.com/#p{}" + urls = [url_format.format(i) for i in range(2, 3)] + # 由于该网站是POST 请求,所以仅抓取一页,目的是测试 BS 的用法 + get_html(urls[0], headers)