# -*- coding: UTF-8 -*-
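"""Zcool (zcool.com.cn) user crawler.

Producer threads start from a seed user's follow list, expand the
pagination, and enqueue every discovered follow-list URL into both the
shared in-memory `urls` list and the MongoDB `text` collection.
Consumer threads claim the queued links one by one, parse each page for
user ids and names, and store the results in the `mkusers` collection.
"""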
import requests  # HTTP requests
import random  # random choice of User-Agent
import re  # regular expressions
import time  # throttling between requests
import threading  # worker threads
import pymongo as pm  # MongoDB driver


class Config:
    def getHeaders(self):
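        """Return request headers with a randomly chosen User-Agent."""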
        user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
        ]
        UserAgent = random.choice(user_agent_list)
        headers = {'User-Agent': UserAgent}
        return headers


# Seed URL: the follow list of one starting user
urls = ["https://douge2013.zcool.com.cn/follow?condition=0&p=1"]
index = 0  # global index assigned to each queued link
g_lock = threading.Lock()  # lock guarding shared state (urls, index, get_index)

# Get a MongoDB connection (the port number must be an int).
# db.authenticate() was removed in pymongo 4.x, so the credentials are
# passed to MongoClient instead, with the target db as the auth source.
client = pm.MongoClient('127.0.0.1', 27017,
                        username="zcool", password="zcool",
                        authSource="zcool")

# Target database: `text` holds the link queue, `mkusers` the results
db = client.zcool

get_index = 0  # index of the next link a consumer should claim


# Producer


class Producer(threading.Thread):
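    """Worker thread that discovers new follow-list URLs.

    Pops a URL from the shared `urls` list, downloads it, expands the
    pagination found on page 1, extracts every followed user's profile
    link, and enqueues the derived follow-list URLs both in memory and
    in the MongoDB `text` collection.
    """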

    def run(self):
        print("线程启动...")
        headers = Config().getHeaders()
        print(headers)
        global urls
        global index
        while True:
            g_lock.acquire()
            if len(urls) == 0:
                g_lock.release()
                time.sleep(0.1)  # avoid busy-waiting while the queue is empty
                continue
            page_url = urls.pop()
            g_lock.release()  # release the lock promptly for other threads
            response = ""
            try:
                response = requests.get(page_url, headers=headers, timeout=5)

            except Exception as http:
                print("生产者异常")
                print(http)
                continue
            content = response.text
            # If this is page 1 of a follow list, expand its pagination first
            is_home = re.search(r'&p=(\d+)', page_url).group(1)

            if is_home == "1":
                # This regex looks odd; the key point is matching across
                # newlines ("下一页" is the site's own "next page" marker)
                pages = re.findall(
                    r'(\d+?)[.\s]*?<\/a>[.\s]*?<!\-\- 下一页 \-\->', content, re.S)  # page numbers
                page_size = 1
                if pages:
                    # compare numerically: max() on strings would rank "9" > "10"
                    page_size = max(int(p) for p in pages)
                    if page_size > 1:  # more than one page: queue the rest too
                        url_arr = []
                        threading_links_1 = []
                        for page in range(2, page_size + 1):
                            # greedy \d+ so multi-digit page numbers are
                            # replaced whole
                            url = re.sub(r'&p=\d+', "&p=" + str(page), page_url)
                            threading_links_1.append(url)
                            g_lock.acquire()
                            index += 1
                            link_index = index  # snapshot inside the lock
                            g_lock.release()

                            url_arr.append({"index": link_index, "link": url})

                        g_lock.acquire()
                        urls += threading_links_1  # enqueue the new page URLs
                        g_lock.release()
                        try:
                            db.text.insert_many(url_arr, ordered=False)
                        except Exception as e:
                            print("Database insert exception")
                            print(e)
                            continue

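            # Extract the profile links of followed users from avatar anchors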
            rc = re.compile(
                r'<a href="(.*?)" title=".*?" class="avatar" target="_blank" z-st="member_content_card_1_user_face">')
            follows = rc.findall(content)
            # print(follows)
            fo_url = []
            threading_links_2 = []
            for u in follows:
                # build the follow-list address for this user
                this_url = "%s/follow?condition=0&p=1" % u
                g_lock.acquire()
                index += 1
                link_index = index  # snapshot inside the lock
                g_lock.release()
                fo_url.append({"index": link_index, "link": this_url})
                threading_links_2.append(this_url)

            g_lock.acquire()
            urls += threading_links_2
            g_lock.release()
            # print(len(fo_url))
            try:
                db.text.insert_many(fo_url, ordered=False)
            except Exception:  # e.g. fo_url may be empty, which raises
                continue



# Consumer


class Consumer(threading.Thread):
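    """Worker thread that parses queued pages.

    Claims one link at a time from the MongoDB `text` collection via
    find_one_and_delete (an atomic pop), downloads the page, extracts
    each user's id and name, and stores them in `mkusers`.
    """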

    def run(self):
        headers = Config().getHeaders()

        global get_index
        while True:

            g_lock.acquire()
            get_index += 1
            my_index = get_index  # snapshot inside the lock
            g_lock.release()
            # Claim one stored link; find_one_and_delete is an atomic pop,
            # so no two consumers can process the same document
            link = db.text.find_one_and_delete({"index": my_index})
            page_url = ""
            if link:
                page_url = link["link"]
                print(page_url+">>>网页分析中...")
            else:
                continue

            response = ""
            try:
                response = requests.get(page_url, headers=headers, timeout=5)

            except Exception as http:
                print("消费者有异常")
                print(http)
                continue

            content = response.text
            # Regex for an older page layout, kept for reference:
            # rc = re.compile(r'divEditOperate_(?P<ID>\d*)[\"] .*>[\s\S]*?<p class=\"state\">.*?(?P<级别>\w*P).*</span></span>(?P<是否认证><br/>)?.*?</p>[\s\S]*?<div class=\"info clearfix\">[\s\S]*?<a class=\"imgBorder\" href=\"\/(?P<主页>.*?)\" hidefocus=\"true\">[\s\S]*?<img .*?src=\"(?P<头像>.*?)\".*?alt=\".*?\" title=\"(?P<昵称>.*?)\" />[\s\S]*?<p class=\"font12 lesserColor\">(?P<地点>.*?)&nbsp.*?<span class=\"font12 mainColor\">(?P<粉丝数目>\d*?)</span>')
            # Current layout: user id and name sit on div.author-info
            rc = re.compile(
                r'<div class="author-info" data-id="(?P<ID>\d+?)" data-name="(?P<NAME>.*?)">')
            user_info = rc.findall(content)
            print(">>>>>>>>>>>>>>>>>>>>")
            users = []
            for user in user_info:
                post = {
                    "id": user[0],
                    "name": user[1]
                }

                users.append(post)
            print(users)

            try:
                if users:  # insert_many raises on an empty list
                    db.mkusers.insert_many(users, ordered=False)
            except Exception as e:
                print("Database insert exception")
                print(e)
                continue

            time.sleep(1)

            print("<<<<<<<<<<<<<<<<<<<<")


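# Start 5 producer and 7 consumer threads; all of them loop forever, so
# the process must be stopped manually to end the crawl.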
if __name__ == "__main__":
    for i in range(5):
        p = Producer()
        p.start()

    for i in range(7):
        c = Consumer()
        c.start()