# -*- coding: UTF-8 -*-
"""Zcool (zcool.com.cn) user-follow crawler.

Producer threads walk paginated "follow" list pages, harvesting new user
URLs into a MongoDB-backed work queue (collection ``zcool.text``).
Consumer threads claim queued URLs one at a time, scrape basic user info
from each page, and store the results in ``zcool.mkusers``.

NOTE(review): this module was recovered from a mangled diff.  The regex
literals marked TODO below were destroyed during extraction and MUST be
restored from the original source before the crawler can work.
"""
import requests     # HTTP client (third-party)
import random       # random User-Agent selection
import re           # HTML scraping via regular expressions
import time         # request throttling
import threading    # producer/consumer worker threads
import pymongo as pm  # MongoDB driver (third-party)


class Config():
    """Supplies randomized request headers to reduce the chance of being blocked."""

    def getHeaders(self):
        """Return a headers dict with a User-Agent picked at random."""
        # BUG FIX: the original list was missing the comma after the first
        # entry, so Python implicitly concatenated the first two strings
        # into a single malformed User-Agent and the list had one fewer
        # (and one broken) agent than intended.
        user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        ]
        UserAgent = random.choice(user_agent_list)
        headers = {'User-Agent': UserAgent}
        return headers


# Seed URL for the crawl (a user's "follow" list, page 1).
urls = ["https://douge2013.zcool.com.cn/follow?condition=0&p=1"]
index = 0                    # monotonically increasing queue index
g_lock = threading.Lock()    # guards `urls`, `index` and `get_index`

# MongoDB connection (port must be an int).
client = pm.MongoClient('127.0.0.1', 27017)

# Target database.
db = client.zcool

# NOTE(review): Database.authenticate() was removed in PyMongo 4.x;
# on modern drivers pass username/password to MongoClient instead.
db.authenticate("zcool", "zcool")
get_index = 0                # next queue index the consumers will claim


class Producer(threading.Thread):
    """Discovers follow-list pages and enqueues new user URLs."""

    def run(self):
        print("线程启动...")
        headers = Config().getHeaders()
        print(headers)
        global urls
        global index
        while True:
            # Take one URL off the in-memory queue under the lock.
            g_lock.acquire()
            if len(urls) == 0:
                g_lock.release()
                # BUG FIX: sleep briefly instead of spinning at 100% CPU
                # while the queue is empty.
                time.sleep(0.1)
                continue
            page_url = urls.pop()
            g_lock.release()  # release promptly so other threads can proceed

            try:
                response = requests.get(page_url, headers=headers, timeout=5)
            except Exception as http:
                print("生产者异常")
                print(http)
                continue
            content = response.text

            # The page number travels in the "&p=N" query parameter.
            is_home = re.search(r'\&p\=(\d+?)', page_url).group(1)
            if is_home == str(1):
                # Page 1 only: read the pager to learn the total page count.
                # TODO(review): this pattern was truncated in the recovered
                # source — restore the full pager regex before running.
                pages = re.findall(
                    r'(\d+?)[.\s]*?<\/a>[.\s]*?', content, re.S)
                page_size = int(max(pages)) if pages else 1
                if page_size > 1:
                    # Generate URLs for pages 2..page_size and enqueue them.
                    url_arr = []
                    pending_links = []
                    for page in range(2, page_size + 1):
                        url = re.sub(r'\&p\=(\d+?)', "&p=" + str(page),
                                     page_url)
                        pending_links.append(url)
                        # NOTE(review): `index` is incremented under the lock
                        # but read outside it below — two producers can race
                        # and record duplicate indexes; confirm intended.
                        g_lock.acquire()
                        index += 1
                        g_lock.release()
                        url_arr.append({"index": index, "link": url})

                    g_lock.acquire()
                    urls += pending_links
                    g_lock.release()
                    try:
                        # ordered=False: keep inserting past duplicate-key
                        # errors on already-seen links.
                        db.text.insert_many(url_arr, ordered=False)
                    except Exception as e:
                        print("数据库输入异常")
                        print(e)
                        continue

            # Harvest links to other users, then enqueue their follow pages.
            # TODO(review): the link-extraction pattern was destroyed in the
            # recovered source (it is empty here) — restore it before running.
            rc = re.compile(r'')
            follows = rc.findall(content)
            fo_url = []
            pending_links = []
            for u in follows:
                this_url = "%s/follow?condition=0&p=1" % u
                g_lock.acquire()
                index += 1
                g_lock.release()
                fo_url.append({"index": index, "link": this_url})
                pending_links.append(this_url)

            g_lock.acquire()
            urls += pending_links
            g_lock.release()
            try:
                db.text.insert_many(fo_url, ordered=False)
            # BUG FIX: narrowed from a bare `except:` that also swallowed
            # KeyboardInterrupt/SystemExit; duplicates are still tolerated.
            except Exception:
                continue


class Consumer(threading.Thread):
    """Claims queued user URLs from MongoDB and scrapes their profile info."""

    def run(self):
        headers = Config().getHeaders()

        global get_index
        while True:
            g_lock.acquire()
            get_index += 1
            g_lock.release()
            # Atomically claim (fetch + remove) one queued link by index,
            # so no two consumers process the same URL.
            link = db.text.find_one_and_delete({"index": get_index})
            page_url = ""
            if link:
                page_url = link["link"]
                print(page_url + ">>>网页分析中...")
            else:
                continue

            try:
                response = requests.get(page_url, headers=headers, timeout=5)
            except Exception as http:
                print("消费者有异常")
                print(http)
                continue

            content = response.text
            # TODO(review): the user-card pattern (which yields
            # (id, name) tuples) was destroyed in the recovered source —
            # restore it before running.
            rc = re.compile(r'')
            user_info = rc.findall(content)
            print(">>>>>>>>>>>>>>>>>>>>")
            users = [{"id": user[0], "name": user[1]} for user in user_info]
            print(users)

            # BUG FIX: insert_many([]) raises InvalidOperation — only insert
            # when the page actually yielded users.
            if users:
                try:
                    db.mkusers.insert_many(users, ordered=False)
                except Exception as e:
                    print("数据库输入异常")
                    print(e)
                    continue

            time.sleep(1)  # throttle between pages

            print("<<<<<<<<<<<<<<<<<<<<")


if __name__ == "__main__":
    # 5 producers discover URLs; 7 consumers scrape them.
    for i in range(5):
        p = Producer()
        p.start()

    for i in range(7):
        c = Consumer()
        c.start()