From 701e559f26ba750c3d97792b9cf04743cf1e0b32 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=A2=A6=E6=83=B3=E6=A9=A1=E7=9A=AE=E6=93=A6?=
Date: Mon, 23 Aug 2021 15:16:54 +0800
Subject: [PATCH] =?UTF-8?q?=E4=B8=80=E8=B7=AF=E5=95=86=E6=9C=BA=E7=BD=91?=
 =?UTF-8?q?=E5=8A=A0=E7=9B=9F=E6=95=B0=E6=8D=AE=E9=87=87=E9=9B=86?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...60\346\215\256\351\207\207\351\233\206.py" | 129 ++++++++++++++++++
 ...0\346\224\276\345\234\260\345\235\200.txt" |   0
 2 files changed, 129 insertions(+)
 create mode 100644 "NO20/\344\270\200\350\267\257\345\225\206\346\234\272\347\275\221\345\212\240\347\233\237\346\225\260\346\215\256\351\207\207\351\233\206.py"
 create mode 100644 "NO20/\345\212\240\347\233\237\347\275\221\347\253\231\346\225\260\346\215\256\345\214\205/HTML\346\226\207\344\273\266\345\255\230\346\224\276\345\234\260\345\235\200.txt"

diff --git "a/NO20/\344\270\200\350\267\257\345\225\206\346\234\272\347\275\221\345\212\240\347\233\237\346\225\260\346\215\256\351\207\207\351\233\206.py" "b/NO20/\344\270\200\350\267\257\345\225\206\346\234\272\347\275\221\345\212\240\347\233\237\346\225\260\346\215\256\351\207\207\351\233\206.py"
new file mode 100644
index 0000000..3b30d22
--- /dev/null
+++ "b/NO20/\344\270\200\350\267\257\345\225\206\346\234\272\347\275\221\345\212\240\347\233\237\346\225\260\346\215\256\351\207\207\351\233\206.py"
@@ -0,0 +1,129 @@
+import requests
+from lxml.html import etree
+import random
+import time
+
+
+class SSS:
+    def __init__(self):
+        self.start_url = 'http://xiangmu.1637.com/p1.html'
+        self.url_format = 'http://xiangmu.1637.com/p{}.html'
+        self.session = requests.Session()
+        self.headers = self.get_headers()
+
+    def get_headers(self):
+        uas = [
+            "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
+            "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
+            "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
+            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
+            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
+            "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
+            "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
+            "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
+            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
+            "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
+            "Sosospider+(+http://help.soso.com/webspider.htm)",
+            "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)"
+        ]
+        ua = random.choice(uas)
+        headers = {
+            "user-agent": ua,
+            "referer": "https://www.baidu.com"
+        }
+        return headers
+
+    def get_pagesize(self):
+
+        with self.session.get(url=self.start_url, headers=self.headers, timeout=5) as res:
+            if res.text:
+                element = etree.HTML(res.text)
+                # select the em tag via a cssselect selector
+                div_total = element.cssselect('#div_total>em')
+                # read the text inside the em tag (div_total[0].text) and convert it to an integer
+                total = int(div_total[0].text)
+                # derive the page count (10 items per page)
+                pagesize = int(total / 10) + 1
+                # print(pagesize)
+                # if the total is an exact multiple of 10, no extra page is needed
+                if total % 10 == 0:
+                    pagesize = int(total / 10)
+
+                return pagesize
+            else:
+                return None
+
+    def get_detail(self, page):
+        with self.session.get(url=self.url_format.format(page), headers=self.headers, timeout=5) as res:
+            if res.text:
+                with open(f"./加盟网站数据包/{page}.html", "w+", encoding="utf-8") as f:
+                    f.write(res.text)
+            else:
+                # no data came back, so request the page again
+                print(f"Page {page} returned no data, retrying")
+                self.get_detail(page)
+
+    def run(self):
+        pagesize = self.get_pagesize()
+        # +1 keeps the last page; for a quick test, pagesize can temporarily be set to 20
+        for page in range(1, pagesize + 1):
+            self.get_detail(page)
+            time.sleep(2)
+            print(f"Page {page} fetched!")
+
+
+# data extraction class
+class Analysis:
+    def __init__(self):
+        pass
+
+    # strip special characters
+    def remove_character(self, origin_str):
+        if origin_str is None:
+            return
+        origin_str = origin_str.replace('\n', '')
+        origin_str = origin_str.replace(',', ',')
+        return origin_str
+
+    def format(self, text):
+        html = etree.HTML(text)
+        # grab every project block div
+        div_xminfos = html.cssselect('div.xminfo')
+        for xm in div_xminfos:
+            adtexts = self.remove_character(xm.cssselect('a.adtxt')[0].text)  # ad slogan
+            url = xm.cssselect('a.adtxt')[0].attrib.get('href')  # detail page URL
+
+            brands = xm.cssselect(':nth-child(2)>:nth-child(2)')[1].text  # brand name
+            categorys = xm.cssselect(':nth-child(2)>:nth-child(3)>a')[0].text  # primary category, e.g. ["餐饮","小吃"]
+            types = ''
+            try:
+                # a secondary category may not exist
+                types = xm.cssselect(':nth-child(2)>:nth-child(3)>a')[1].text  # secondary category
+            except IndexError:
+                pass
+            creation = xm.cssselect(':nth-child(2)>:nth-child(6)')[0].text  # year the brand was founded
+            franchise = xm.cssselect(':nth-child(2)>:nth-child(9)')[0].text  # number of franchise stores
+            company = xm.cssselect(':nth-child(3)>span>a')[0].text  # company name
+
+            introduce = self.remove_character(xm.cssselect(':nth-child(4)>span')[0].text)  # brand introduction
+            pros = self.remove_character(xm.cssselect(':nth-child(5)>:nth-child(2)')[0].text)  # products on offer
+            investment = xm.cssselect(':nth-child(5)>:nth-child(4)>em')[0].text  # investment amount
+            # join the fields into one CSV row
+            long_str = f"{adtexts},{categorys},{types},{brands},{creation},{franchise},{company},{introduce},{pros},{investment},{url}"
+            with open("./加盟数据.csv", "a+", encoding="utf-8") as f:
+                f.write(long_str + "\n")
+
+    def run(self):
+        for i in range(1, 5704):
+            with open(f"./加盟网站数据包/{i}.html", "r", encoding="utf-8") as f:
+                text = f.read()
+                self.format(text)
+
+
+if __name__ == '__main__':
+    # to collect pages, uncomment the SSS block; to extract data, keep the Analysis block
+    # s = SSS()
+    # s.run()
+    # extract the data
+    a = Analysis()
+    a.run()
\ No newline at end of file
diff --git "a/NO20/\345\212\240\347\233\237\347\275\221\347\253\231\346\225\260\346\215\256\345\214\205/HTML\346\226\207\344\273\266\345\255\230\346\224\276\345\234\260\345\235\200.txt" "b/NO20/\345\212\240\347\233\237\347\275\221\347\253\231\346\225\260\346\215\256\345\214\205/HTML\346\226\207\344\273\266\345\255\230\346\224\276\345\234\260\345\235\200.txt"
new file mode 100644
index 0000000..e69de29
--
GitLab