import os
import random
import threading
import time
from threading import Lock, Thread

import requests


class MyThread(threading.Thread):
    """Worker thread that fetches one URL and appends its user list to a CSV.

    The thread relies on two module-level names that must exist before
    ``start()`` is called: ``urls`` (a list of pending URLs, consumed with
    ``pop()``) and ``lock`` (a ``threading.Lock`` guarding that list).
    """

    def __init__(self, name):
        """Create the worker and register *name* as the thread name."""
        super().__init__(name=name)

    def run(self):
        """Take one URL off the shared list, fetch it and store the result.

        Successful pages are appended to ``./qing_gee_data.csv``; URLs whose
        request fails, whose body is not JSON, or whose JSON ``code`` is 400
        are appended to ``./error.txt``.
        """
        # `with` guarantees the lock is released even if the body raises.
        with lock:
            # More workers than URLs can be started for the final batch;
            # such a worker simply has nothing to do.
            if not urls:
                return
            one_url = urls.pop()
            print("正在爬取:", one_url)

        print("任意线程等待随机时间")
        time.sleep(random.randint(1, 3))

        try:
            res = requests.get(
                one_url, headers=self.get_headers(), timeout=5)
            payload = res.json()  # parse once, reuse below
        except (requests.RequestException, ValueError) as exc:
            # Network errors / non-JSON bodies: remember the URL instead of
            # losing it when the thread dies.
            print(exc)
            self._record_failure(one_url)
            return

        if payload["code"] == 400:
            print(payload)
            self._record_failure(one_url)
            return

        rows = []
        for user in payload["data"]["list"]:
            name = user['username']
            nickname = self.remove_character(user['nickname'])
            userAvatar = user['userAvatar']
            blogUrl = user['blogUrl']
            blogExpert = user['blogExpert']
            briefIntroduction = self.remove_character(
                user['briefIntroduction'])
            row = (f'{name},{nickname},{userAvatar},{blogUrl},'
                   f'{blogExpert},{briefIntroduction}')
            print(row)
            rows.append(row + "\n")

        # Open the CSV once per page rather than once per user.
        if rows:
            with open('./qing_gee_data.csv', 'a+', encoding='utf-8') as f:
                f.writelines(rows)

    def _record_failure(self, url):
        """Report *url* as failed and append it to ``./error.txt``."""
        print("异常数据", url)
        with open('./error.txt', 'a+', encoding='utf-8') as f:
            f.write(url + "\n")

    # Strip characters that would break a comma-separated row.
    def remove_character(self, origin_str):
        """Return *origin_str* made safe for one CSV cell.

        Newlines are removed and ASCII commas are replaced by the full-width
        comma (U+FF0C) so the value cannot split into extra columns.
        ``None`` is passed through unchanged.
        """
        if origin_str is None:
            return None
        origin_str = origin_str.replace('\n', '')
        # The previous replace(',', ',') was a no-op and left commas in place.
        origin_str = origin_str.replace(',', '\uff0c')
        return origin_str

    def get_headers(self):
        """Build request headers with a randomly chosen crawler user-agent."""
        # NOTE(review): several entries end in "+" / "(+" — it looks like a
        # trailing URL was stripped from them; confirm the intended strings.
        uas = [
            "Mozilla/5.0 (compatible; Baiduspider/2.0; +",
            "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +",
            "Baiduspider-image+(+",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
            "Mozilla/5.0 (compatible; Googlebot/2.1; +",
            "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +",
            "Sogou web spider/4.0(+",
            "Sogou News Spider/4.0(+",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
            "Mozilla/5.0 (compatible; bingbot/2.0; +",
            "Sosospider+(+",
            "Mozilla/5.0 (compatible; Yahoo! Slurp China;",
        ]
        ua = random.choice(uas)
        headers = {
            "user-agent": ua,
            # Placeholder credentials; replace with real values before running.
            'cookie': 'UserName=你的ID; UserInfo=你的UserInfo; UserToken=你的UserToken;',
            # NOTE(review): referer is empty here — confirm whether a URL
            # is expected.
            "referer": ""
        }
        return headers


if __name__ == '__main__':
    lock = Lock()
    # NOTE(review): the template has no scheme/host in front of `{}`; as
    # written the request layer will presumably reject every URL — confirm
    # the intended endpoint.
    url_format = '{}&size=20&noMore=false&blogUsername=qing_gee'
    urls = [url_format.format(i) for i in range(1, 13300)]

    while urls:
        print(len(urls))
        # Fresh list per batch so finished threads are not re-joined forever.
        batch = [MyThread("t" + str(i)) for i in range(5)]
        for worker in batch:
            worker.start()
        for worker in batch:
            worker.join()