add commit author info crawler python script by DaXiaoQiang

ad67972f · Miykael_xxm · 78a1639d · ad67972f
隐藏空白更改
内联并排

Showing 1 changed file with 164 additions and 0 deletions

src/github_author_claw.py src/github_author_claw.py +164 -0

未找到文件。
--- a/src/github_author_claw.py
+++ b/src/github_author_claw.py
+#!/usr/bin/env python2
+# coding=utf8
+# author: @DaXiaoQiang
+
+#-------------------Related table schema----------------------------------
+#CREATE TABLE `repo` (
+#  `actor_email` varchar(255) DEFAULT NULL,
+# `sum_total` varchar(255) DEFAULT NULL,
+#  `any_repo_path` varchar(255) DEFAULT NULL,
+#  `any_commit_id` varchar(255) DEFAULT NULL,
+#  `gpt` int DEFAULT NULL,
+#  `avatar_url` varchar(255) DEFAULT NULL,
+#  `name` varchar(255) DEFAULT NULL,
+#  `company` varchar(255) DEFAULT NULL,
+#  `location` varchar(255) DEFAULT NULL,
+# `followers` int DEFAULT NULL,
+#  `author_id` varchar(50) DEFAULT NULL,
+#  `type` varchar(50) DEFAULT NULL,
+#  `login` varchar(255) DEFAULT NULL,
+#  `created_at` varchar(255) DEFAULT NULL,
+#  `updated_at` varchar(255) DEFAULT NULL,
+#  KEY `actor_email` (`actor_email`)
+#) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
+#------------------------------------------------------------------
+
+# Runtime environment: Python 2.7.18, MySQL 5.7
+
+from __future__ import division
+
+
+import os,requests,urllib,sys,time,json,urllib2,gzip,random
+import MySQLdb
+from StringIO import StringIO
+
+
+
+# Download a single GitHub page
+def api_github(myurl,github_token):
+	
+	#try: 
+		github_cookies='logged_in=no;tz=Asia%2FShanghai;_octo=GH1.1.693787499.1651047541;_gh_sess=zJpIR8dslqtlf9xuaA55ErkQoLjfVy3a8tiFpjF%2Fkr9%2F8VU%2BNjjujqysHHTC3lmOYUAhf1TEhky0CYq4XJPmwPi8duA1Eaot4Z%2FMIhE%2BPK6VDp4bO%2BUgqIlO3cbdAHSyc0R9Oynm%2FiRbL%2BuVV2Tt5lf2RVlUIfgN5FfCIodnJhozuuXCRXn7FSqMvR7KG9EX%2FQKktDHRYrnBosUpnV5JBIbZpYJ8qLuy5cReJSl8BlSYo8nqzKlYn3%2FO7%2BrwG916a4VjZPotuZ4c1C4yQZMgaA%3D%3D--JQ5IVzgrJsm%2F5gR7--HNRdOQU6i7zG1%2BK8LVeMVA%3D%3D;'
+		if os.path.exists('github_cookies.txt'):
+			with open("github_cookies.txt", "r") as f:  # 打开文件
+				github_cookies = f.read()  # 读取文件
+		github_cookies=github_cookies.strip()
+		req = urllib2.Request(myurl)
+		req.add_header("Cookie",github_cookies)
+		req.add_header("Authorization",'token '+github_token)
+		req.add_header('Accept-Encoding','gzip, deflate')
+		req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4844.82 Safari/537.36')
+		req.add_header("Referer","https://www.github.com/" )
+		res = urllib2.urlopen(req) 
+		if res.info().get('Content-Encoding') == 'gzip':
+			buf = StringIO(res.read())
+			f = gzip.GzipFile(fileobj=buf)
+			cp = f.read()
+		else:
+			cp = res.read()
+		#print cp
+		res.close()
+		return cp
+	#except:
+	#	print "error:"+myurl
+	#	return None
+
+if __name__=="__main__":
+	
+	conn=MySQLdb.connect(host="",user="",passwd="",db="",charset="utf8") #mysql连接信息
+	
+	cursor = conn.cursor()
+	
+	while 1:
+		#轮询github key列表
+		token_list=[""] # GitHub 的 PTAs
+		random.shuffle(token_list)
+		
+		#随机选取1000条未处理数据，方便多线程调度
+		sql='SELECT actor_email,any_repo_path,any_commit_id from repo where actor_email is not null and gpt is null order by rand() limit 1000'
+		cursor.execute(sql)
+		rows=cursor.fetchall()
+		x=0
+		for row in rows:
+			try:
+				actor_email=row[0]
+				any_repo_path=row[1]
+				any_commit_id=row[2]
+				
+				print (actor_email)
+				
+				#检查本记录是否已经被处理
+				sql='SELECT gpt from repo where actor_email="'+actor_email+'"'
+				cursor.execute(sql)
+				gpts=cursor.fetchone()
+				
+				if gpts[0]>0:
+					continue
+				
+				if x>=len(token_list)-1:
+					x=0
+				else:
+					x=x+1
+				
+				mytoken=token_list[x]
+				
+				print(mytoken)
+				print('https://api.github.com/repos/'+any_repo_path+'/commits/'+any_commit_id)
+				
+				
+				while 1:
+					try:
+						myjson=api_github('https://api.github.com/repos/'+any_repo_path+'/commits/'+any_commit_id,mytoken)
+						break
+					except Exception, e:
+						#过滤没有返回值的错误
+						if str(e).find('HTTP Error 422')>=0:
+							sql="update repo set gpt=%s where actor_email=%s" 
+							param=(2,actor_email)
+							cursor=conn.cursor()
+							n=cursor.execute(sql,param)
+							conn.commit()
+							break
+						print('----------------get url error---------------------------')
+						print(str(e))
+						print(mytoken)
+						print('https://api.github.com/repos/'+any_repo_path+'/commits/'+any_commit_id)
+						time.sleep(5)
+				
+				json_list=json.loads(myjson)
+				if json_list!=None:
+					author_url=json_list['author']["url"]
+					
+					myjson=api_github(author_url,mytoken)
+					json_list=json.loads(myjson)
+					if json_list!=None:
+						avatar_url=json_list['avatar_url']
+						type=json_list['type']
+						name=json_list['name']
+						company=json_list['company']
+						location=json_list['location']
+						followers=json_list['followers']
+						id=json_list['id']
+						login=json_list['login']
+						created_at=json_list['created_at']
+						updated_at=json_list['updated_at']
+						#数据保存到库
+						sql="update repo set avatar_url=%s,type=%s,name=%s,company=%s,location=%s,followers=%s,author_id=%s,login=%s,gpt=%s,created_at=%s,updated_at=%s where actor_email=%s" 
+						param=(avatar_url,type,name,company,location,followers,id,login,1,created_at,updated_at,actor_email)
+						cursor=conn.cursor()
+						n=cursor.execute(sql,param)
+						conn.commit()
+			except:
+				print('error')
+				#错误记录到库
+				sql="update repo set gpt=%s where actor_email=%s" 
+				param=(2,actor_email)
+				cursor=conn.cursor()
+				n=cursor.execute(sql,param)
+				conn.commit()
+			
+			
+		
+			
+	
+	
\ No newline at end of file