Commit 39b65cef authored by hjCodeCloud

19lou matchmaking crawler

Parent fb2f7913
import os
import time

import requests
from lxml import etree
from fake_useragent import UserAgent


def save(src, title):
    """Download a single image and write it to the imgs/ directory."""
    try:
        res = requests.get(src)
        # Create the output directory on first use so the write cannot fail.
        os.makedirs("imgs", exist_ok=True)
        with open(f"imgs/{title}.jpg", "wb") as f:
            f.write(res.content)
    except Exception as e:
        print(e)
def run(url):
    # ua = UserAgent(cache=False)
    ua = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36"
    headers = {
        "User-Agent": ua,
        "Host": "www.19lou.com",
        "Referer": "https://www.19lou.com/r/1/19lnsxq-233.html",
        "Cookie": "_Z3nY0d4C_=37XgPK9h"  # value taken from the site's anti-crawler code
    }
    try:
        res = requests.get(url=url, headers=headers)
        text = res.text
        # Parse the HTML string into an Element tree
        html = etree.HTML(text)
        # XPath: @class selects elements by their class attribute
        divs = html.xpath("//div[@class='pics']")
        # print(len(divs))
        # Iterate over the matched Element nodes
        for div in divs:
            # Extract the image URL; note the attribute is data-src, not src
            src = div.xpath("./img/@data-src")[0]
            # Extract the title
            title = div.xpath("./img/@alt")[0]
            save(src, title)
    except Exception as e:
        print(e)
if __name__ == '__main__':
    # First page plus the numbered pages 114..242 of the board
    urls = ["https://www.19lou.com/r/1/19lnsxq.html"]
    for i in range(114, 243):
        urls.append(f"https://www.19lou.com/r/1/19lnsxq-{i}.html")
    for url in urls:
        print(f"Crawling {url}")
        run(url)
        # time.sleep(5)
    print("All pages crawled")