"""Collect cat-picture URLs from the p.ik123.com "maomi" gallery.

Recovered from git patch ba21aa3f2e134fb2d06d32ec69642462d9a2e53a
(dated Sun, 6 Jun 2021), which created ``NO3/third.py``.

Pipeline:
    1. read the first list page and find out how many list pages exist;
    2. visit every list page and collect the links to the detail pages;
    3. visit every detail page and collect the image addresses.

NOTE(review): the HTML-matching regular expressions of the original script
were stripped of their tags before this file reached review (only the
fragments ``"末页"`` and ``'.*?'`` survived). The patterns below are
best-effort reconstructions -- verify them against the live markup.
"""

import re
from urllib.parse import urljoin

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
}

# List pages are numbered 68_1.html, 68_2.html, ...
LIST_URL_TEMPLATE = "http://p.ik123.com/zt/maomi/68_{page}.html"

# Captures N from the link ``<a href='..._N.html'>末页</a>`` ("last page").
_LAST_PAGE_RE = re.compile(
    r"""href=['"]?[^'">\s]*?_(\d+)\.html['"]?[^>]*>\s*末页"""
)

# NOTE(review): assumes each detail page is linked from the list page by an
# anchor that directly wraps a thumbnail <img> -- confirm against the site.
_DETAIL_LINK_RE = re.compile(
    r"""<a\s[^>]*?href=["']?([^"'\s>]+)["']?[^>]*>\s*<img""",
    re.IGNORECASE,
)

# NOTE(review): matches every <img> whose src ends in a common image
# extension; narrow it to the content container if it picks up logos/ads.
_IMAGE_SRC_RE = re.compile(
    r"""<img\s[^>]*?src=["']?([^"'\s>]+\.(?:jpe?g|png|gif))""",
    re.IGNORECASE,
)


def _fetch_html(url, error_label):
    """Download *url* and return its text decoded as gb2312.

    Args:
        url: Address of the page to download.
        error_label: Message printed in front of the exception on failure.

    Returns:
        The page text, or ``None`` when the request failed (the failure is
        printed, not raised, so one bad page does not abort the crawl).
    """
    # Imported here rather than at module level so the pure parsing helpers
    # stay importable on machines without the third-party package.
    import requests

    try:
        res = requests.get(url=url, headers=HEADERS, timeout=5)
        # Do not try to parse an HTTP error page as if it were content.
        res.raise_for_status()
    except requests.RequestException as e:
        print(error_label, e)
        return None
    res.encoding = "gb2312"
    return res.text


def get_pagesize(html):
    """Return the number of list pages advertised by the "末页" link.

    Args:
        html: Text of a list page.

    Returns:
        The page count as an ``int``, or ``0`` when no last-page link is
        present.
    """
    match = _LAST_PAGE_RE.search(html)
    return int(match.group(1)) if match is not None else 0


def get_wait_list(url):
    """Build the list of list-page URLs that still have to be crawled.

    Args:
        url: Address of the first list page.

    Returns:
        One URL per list page; an empty list when the page could not be
        fetched or carries no pagination link.
    """
    html_text = _fetch_html(url, "获取分页异常")
    if html_text is None:
        return []

    pagesize = get_pagesize(html_text)
    if pagesize <= 0:
        return []

    print(f"获取到{pagesize}页数据")
    return [LIST_URL_TEMPLATE.format(page=i) for i in range(1, pagesize + 1)]


def format_detail(html):
    """Extract the detail-page links (as written in the markup) from *html*.

    Args:
        html: Text of a list page.

    Returns:
        The ``href`` values in document order; possibly relative URLs.
    """
    return _DETAIL_LINK_RE.findall(html)


def get_detail_list(url):
    """Fetch one list page and return absolute URLs of its detail pages.

    Args:
        url: Address of a list page.

    Returns:
        Absolute detail-page URLs; an empty list when the fetch failed.
    """
    html_text = _fetch_html(url, "获取详情页列表异常")
    if html_text is None:
        return []
    # Links may be relative to the list page, so resolve them against it.
    return [urljoin(url, href) for href in format_detail(html_text)]


def format_mao_img(html):
    """Extract the image addresses (as written in the markup) from *html*.

    Args:
        html: Text of a detail page.

    Returns:
        The ``src`` values in document order; possibly relative URLs.
    """
    return _IMAGE_SRC_RE.findall(html)


def get_mao_img(detail_url):
    """Fetch one detail page and return absolute URLs of its images.

    Args:
        detail_url: Address of a detail page.

    Returns:
        Absolute image URLs; an empty list when the fetch failed.
    """
    html_text = _fetch_html(detail_url, "获取猫咪图片异常")
    if html_text is None:
        return []
    return [urljoin(detail_url, src) for src in format_mao_img(html_text)]


def main(start_url="http://p.ik123.com/zt/maomi/68_1.html", max_images=100):
    """Run the crawl and print a short summary.

    Args:
        start_url: First list page of the gallery.
        max_images: Stop visiting detail pages once more than this many
            image URLs were collected (kept from the original as a cap for
            test runs).
    """
    detail_list = []
    for url in get_wait_list(url=start_url):
        print(f"正在抓取{url}")
        detail_list.extend(get_detail_list(url))

    print(f"获取到{len(detail_list)}条详情页")

    mao_imgs = []
    for index, mao_detail in enumerate(detail_list):
        if mao_detail:
            print(f"正抓取第{index}页数据")
            mao_imgs.extend(get_mao_img(mao_detail))
        # The cap below exists for test runs only.
        if len(mao_imgs) > max_images:
            break

    print(f"获取到{len(mao_imgs)}条猫咪图")
    print(mao_imgs[:5])


if __name__ == "__main__":
    main()