From ad67972f3f8e522c4a151d7047c0b2134af9f54a Mon Sep 17 00:00:00 2001 From: Miykaelxxm Date: Thu, 13 Oct 2022 15:04:47 +0800 Subject: [PATCH] add commit author info crawler python script by DaXiaoQiang --- src/github_author_claw.py | 164 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 src/github_author_claw.py diff --git a/src/github_author_claw.py b/src/github_author_claw.py new file mode 100644 index 0000000..49ec6d8 --- /dev/null +++ b/src/github_author_claw.py @@ -0,0 +1,164 @@ +#!/usr/bin/env pyhton3 +# coding=utf8 +# auhtor: @DaXiaoQiang + +#-------------------相关表结构---------------------------------- +#CREATE TABLE `repo` ( +# `actor_email` varchar(255) DEFAULT NULL, +# `sum_total` varchar(255) DEFAULT NULL, +# `any_repo_path` varchar(255) DEFAULT NULL, +# `any_commit_id` varchar(255) DEFAULT NULL, +# `gpt` int DEFAULT NULL, +# `avatar_url` varchar(255) DEFAULT NULL, +# `name` varchar(255) DEFAULT NULL, +# `company` varchar(255) DEFAULT NULL, +# `location` varchar(255) DEFAULT NULL, +# `followers` int DEFAULT NULL, +# `author_id` varchar(50) DEFAULT NULL, +# `type` varchar(50) DEFAULT NULL, +# `login` varchar(255) DEFAULT NULL, +# `created_at` varchar(255) DEFAULT NULL, +# `updated_at` varchar(255) DEFAULT NULL, +# KEY `actor_email` (`actor_email`) +#) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; +#------------------------------------------------------------------ + +#运行环境python 2.7.18 mysql5.7 + +from __future__ import division + + +import os,requests,urllib,sys,time,json,urllib2,gzip,random +import MySQLdb +from StringIO import StringIO + + + +#下载一个github页面 +def api_github(myurl,github_token): + + #try: + github_cookies='logged_in=no;tz=Asia%2FShanghai;_octo=GH1.1.693787499.1651047541;_gh_sess=zJpIR8dslqtlf9xuaA55ErkQoLjfVy3a8tiFpjF%2Fkr9%2F8VU%2BNjjujqysHHTC3lmOYUAhf1TEhky0CYq4XJPmwPi8duA1Eaot4Z%2FMIhE%2BPK6VDp4bO%2BUgqIlO3cbdAHSyc0R9Oynm%2FiRbL%2BuVV2Tt5lf2RVlUIfgN5FfCIodnJhozuuXCRXn7FSqMvR7KG9EX%2FQKktDHRYrnBosUpnV5JBIbZpYJ8qLuy5cReJSl8BlSYo8nqzKlYn3%2FO7%2BrwG916a4VjZPotuZ4c1C4yQZMgaA%3D%3D--JQ5IVzgrJsm%2F5gR7--HNRdOQU6i7zG1%2BK8LVeMVA%3D%3D;' + if os.path.exists('github_cookies.txt'): + with open("github_cookies.txt", "r") as f: # 打开文件 + github_cookies = f.read() # 读取文件 + github_cookies=github_cookies.strip() + req = urllib2.Request(myurl) + req.add_header("Cookie",github_cookies) + req.add_header("Authorization",'token '+github_token) + req.add_header('Accept-Encoding','gzip, deflate') + req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4844.82 Safari/537.36') + req.add_header("Referer","https://www.github.com/" ) + res = urllib2.urlopen(req) + if res.info().get('Content-Encoding') == 'gzip': + buf = StringIO(res.read()) + f = gzip.GzipFile(fileobj=buf) + cp = f.read() + else: + cp = res.read() + #print cp + res.close() + return cp + #except: + # print "error:"+myurl + # return None + +if __name__=="__main__": + + conn=MySQLdb.connect(host="",user="",passwd="",db="",charset="utf8") #mysql连接信息 + + cursor = conn.cursor() + + while 1: + #轮询github key列表 + token_list=[""] # GitHub 的 PTAs + random.shuffle(token_list) + + #随机选取1000条未处理数据,方便多线程调度 + sql='SELECT actor_email,any_repo_path,any_commit_id from repo where actor_email is not null and gpt is null order by rand() limit 1000' + cursor.execute(sql) + rows=cursor.fetchall() + x=0 + for row in rows: + try: + actor_email=row[0] + any_repo_path=row[1] + any_commit_id=row[2] + + print (actor_email) + + #检查本记录是否已经被处理 + sql='SELECT gpt from repo where actor_email="'+actor_email+'"' + cursor.execute(sql) + gpts=cursor.fetchone() + + if gpts[0]>0: + continue + + if x>=len(token_list)-1: + x=0 + else: + x=x+1 + + mytoken=token_list[x] + + print(mytoken) + print('https://api.github.com/repos/'+any_repo_path+'/commits/'+any_commit_id) + + + while 1: + try: + myjson=api_github('https://api.github.com/repos/'+any_repo_path+'/commits/'+any_commit_id,mytoken) + break + except Exception, e: + #过滤没有返回值的错误 + if str(e).find('HTTP Error 422')>=0: + sql="update repo set gpt=%s where actor_email=%s" + param=(2,actor_email) + cursor=conn.cursor() + n=cursor.execute(sql,param) + conn.commit() + break + print('----------------get url error---------------------------') + print(str(e)) + print(mytoken) + print('https://api.github.com/repos/'+any_repo_path+'/commits/'+any_commit_id) + time.sleep(5) + + json_list=json.loads(myjson) + if json_list!=None: + author_url=json_list['author']["url"] + + myjson=api_github(author_url,mytoken) + json_list=json.loads(myjson) + if json_list!=None: + avatar_url=json_list['avatar_url'] + type=json_list['type'] + name=json_list['name'] + company=json_list['company'] + location=json_list['location'] + followers=json_list['followers'] + id=json_list['id'] + login=json_list['login'] + created_at=json_list['created_at'] + updated_at=json_list['updated_at'] + #数据保存到库 + sql="update repo set avatar_url=%s,type=%s,name=%s,company=%s,location=%s,followers=%s,author_id=%s,login=%s,gpt=%s,created_at=%s,updated_at=%s where actor_email=%s" + param=(avatar_url,type,name,company,location,followers,id,login,1,created_at,updated_at,actor_email) + cursor=conn.cursor() + n=cursor.execute(sql,param) + conn.commit() + except: + print('error') + #错误记录到库 + sql="update repo set gpt=%s where actor_email=%s" + param=(2,actor_email) + cursor=conn.cursor() + n=cursor.execute(sql,param) + conn.commit() + + + + + + \ No newline at end of file -- GitLab