Commit bbb6e2e1 authored by 梦想橡皮擦

Case recap (复盘案例)

Parent 705acb4c
import random

import requests
from lxml import etree


class Spider16:
    def __init__(self):
        self.wait_urls = ["https://www.qunzou.com/xuexi/list_1_1.html"]
        self.url_template = "https://www.qunzou.com/xuexi/list_1_{num}.html"
        self.details = []

    def get_headers(self):
        uas = [
            "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
            "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
            "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
            "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
            "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
            "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
            "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
            "Sosospider+(+http://help.soso.com/webspider.htm)",
            "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)"
        ]
        ua = random.choice(uas)
        headers = {
            "user-agent": ua,
            "referer": "https://www.baidu.com"
        }
        return headers

    # Build the list of pages waiting to be crawled
    def create_urls(self):
        headers = self.get_headers()
        page_url = self.wait_urls[0]
        res = requests.get(url=page_url, headers=headers, timeout=5)
        html = etree.HTML(res.text)
        # Extract the total page count
        last_page = html.xpath("//span[@class='pageinfo']/strong[1]/text()")[0]
        # Generate the remaining list pages (page 1 is already in wait_urls)
        for i in range(2, int(last_page) + 1):
            self.wait_urls.append(self.url_template.format(num=i))

    def get_html(self):
        for url in self.wait_urls:
            headers = self.get_headers()
            res = requests.get(url, headers=headers, timeout=5)
            if res:
                html = etree.HTML(res.text)
                detail_link_list = html.xpath("//div[@class='list']//h6/a/@href")
                for d in detail_link_list:
                    self.details.append(f"https://www.qunzou.com{d}")
            # For testing, return right away after the first list page
            return

    def get_detail(self):
        for url in self.details:
            headers = self.get_headers()
            res = requests.get(url, headers=headers, timeout=5)
            res.encoding = "gb2312"
            if res:
                html = etree.HTML(res.text)
                sentences = html.xpath("//div[@id='content']//p/text()")
                # Print the extracted sentences
                long_str = "\n".join(sentences)
                print(long_str)
                # with open("sentences.txt", "a+", encoding="utf-8") as f:
                #     f.write(long_str)

    def run(self):
        self.create_urls()
        self.get_html()
        self.get_detail()


if __name__ == '__main__':
    s = Spider16()
    s.run()
import os
import re
import threading
import time

import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"}

# Detail-page image URLs, shared between threads
detail_urls = []

mutex = threading.Lock()


# Collect detail-page URLs from one list page
def get_detail_urls(url):
    res = requests.get(url=url, headers=headers)
    res.encoding = 'gb2312'
    if res is not None:
        html = res.text  # page source
        # Trim the page source down to the target region:
        # keep only the data between <ul class="g-gxlist-imgbox"> and <div class="pagelist">
        html = html[html.find('<ul class="g-gxlist-imgbox">'):html.find('<div class="pagelist">')]
        # The trimmed data can then be parsed with a regular expression
        pattern = re.compile('<a href="(.*?)" target="_blank" title=".*?">')
        # Extract the detail-page addresses
        find_urls = pattern.findall(html)
        if find_urls:
            # Acquire the lock
            mutex.acquire()
            # Append to the shared global list
            detail_urls.extend(find_urls)
            # Release the lock
            mutex.release()


# Image-saving worker thread
def save_image():
    global detail_urls
    while True:
        # Acquire the lock
        mutex.acquire()
        if len(detail_urls) > 0:
            # Take the first item of the list
            img_url = detail_urls[0]
            # Remove it from the list
            del detail_urls[0]
            # Release the lock
            mutex.release()
            res = requests.get(url=img_url, headers=headers)
            if res is not None:
                html = res.text
                # Trim the page source so all images can be extracted in one pass
                html = html[html.find('<div class="img-list3">'):html.find('<div class="m_ssxx">')]
                pattern = re.compile('<img alt=".*?" src="(.*?)" />')
                img_list = pattern.findall(html)
                if img_list:
                    for img in img_list:
                        print(f"Thread {threading.currentThread().name}", "downloading image:", img)
                        try:
                            res = requests.get(img)
                            with open(f"images/{threading.currentThread().name + str(time.time())}.png", "wb+") as f:
                                f.write(res.content)
                        except Exception as e:
                            print(e)
        else:
            # Release the lock before idling, otherwise the next acquire() would deadlock
            mutex.release()
            print("Waiting; if the wait lasts a long time the program can simply be closed")


if __name__ == '__main__':
    # Make sure the output directory exists
    os.makedirs("images", exist_ok=True)
    # Build the paginated list-page URLs
    origin_url = ['http://www.imeitou.com/nvsheng/']
    for i in range(2, 11):
        origin_url.append(f'http://www.imeitou.com/nvsheng/index_{i}.html')
    # Collect the image detail-page addresses
    for d_url in origin_url:
        get_detail_urls(d_url)
    # Quick check of the collected detail-page list
    # (160 addresses were obtained in testing, which is the expected amount)
    print(len(detail_urls))
    # Configure and start the image-saving threads
    # Two worker threads are started here
    save1 = threading.Thread(target=save_image)
    save1.start()
    save2 = threading.Thread(target=save_image)
    save2.start()
# import requests
#
# response = requests.get("https://www.uisdc.com/archives")
# content = response.text
#
# with open("ca_demo.html", "w") as file:
#     file.write(content)
import urllib.parse
decoded = urllib.parse.unquote("%3Ci+class%3D%22uname%22+title%3D%22%E4%BC%98%E7%A7%80%E7%BD%91%E9%A1%B5%E8%AE%BE%E8%AE%A1%22%3E%E4%BC%98%E7%A7%80%E7%BD%91%E9%A1%B5%E8%AE%BE%E8%AE%A1%3C%2Fi%3E")
print(decoded)
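# Added note (not part of the original snippet): unquote() decodes the %XX escapes but
# leaves the '+' characters untouched. For form-encoded data, where '+' stands for a
# space, urllib.parse.unquote_plus converts those as well. A minimal sketch:
decoded_plus = urllib.parse.unquote_plus(
    "%3Ci+class%3D%22uname%22+title%3D%22%E4%BC%98%E7%A7%80%E7%BD%91%E9%A1%B5%E8%AE%BE%E8%AE%A1%22%3E%E4%BC%98%E7%A7%80%E7%BD%91%E9%A1%B5%E8%AE%BE%E8%AE%A1%3C%2Fi%3E")
print(decoded_plus)  # -> <i class="uname" title="优秀网页设计">优秀网页设计</i>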
import asyncio
import os
import time

import requests
from bs4 import BeautifulSoup  # the lxml package must be installed for the 'lxml' parser used below


async def get(url):
    # requests is synchronous, so this call still blocks the event loop
    return requests.get(url)


async def get_html(url):
    print("Preparing to crawl:", url)
    res = await get(url)
    return res.text


async def save_img(img_url):
    print("Downloading image:", img_url)
    res = await get(img_url)
    if res is not None:
        with open(f'./imgs/{time.time()}.jpg', 'wb') as f:
            f.write(res.content)
        return img_url, "ok"


async def main(url_list):
    # Create one task per URL in the batch (5 per batch)
    tasks = [asyncio.ensure_future(get_html(url_list[_])) for _ in range(len(url_list))]
    dones, pending = await asyncio.wait(tasks)
    for task in dones:
        html = task.result()
        soup = BeautifulSoup(html, 'lxml')
        div_tag = soup.find(attrs={'class': 'lbox'})
        imgs = div_tag.find_all('img')
        for img in imgs:
            ret = await save_img(img["data-original"])
            print(ret)


if __name__ == '__main__':
    # Make sure the output directory exists
    os.makedirs("./imgs", exist_ok=True)
    # Switched to the 黄鹤楼 category for easier testing; only 10 pages are used
    urls = [f"https://www.huanghelou.cc/category-44_{page}.html" for page in range(1, 11)]
    total_page = len(urls) // 5 if len(urls) % 5 == 0 else len(urls) // 5 + 1
    # Slice the urls list into batches for crawling
    for page in range(0, total_page):
        start_page = 0 if page == 0 else page * 5
        end_page = (page + 1) * 5
        # Event loop object
        loop = asyncio.get_event_loop()
        loop.run_until_complete(main(urls[start_page:end_page]))
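# Added note (not part of the original snippet): requests.get() is synchronous, so the
# coroutines above still block the event loop and the requests effectively run one at a
# time. A minimal sketch of a non-blocking fetch, assuming the third-party aiohttp
# package is installed (the function name is illustrative only):
async def fetch_html(url):
    import aiohttp  # imported lazily so the script above still runs without aiohttp
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            return await resp.text()
# Usage sketch: html = asyncio.get_event_loop().run_until_complete(fetch_html(urls[0]))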