“01396d91dbe90e73637993017c6b6418bd893ea9”上不存在“...fluid/git@gitcode.net:s920243400/PaddleDetection.git”
提交 ad67972f 编写于 作者: Miykael_xxm's avatar Miykael_xxm 🚴

add commit author info crawler python script by DaXiaoQiang

上级 78a1639d
#!/usr/bin/env python2
# coding=utf8
# author: @DaXiaoQiang
#-------------------Related table schema----------------------------------
#CREATE TABLE `repo` (
# `actor_email` varchar(255) DEFAULT NULL,
# `sum_total` varchar(255) DEFAULT NULL,
# `any_repo_path` varchar(255) DEFAULT NULL,
# `any_commit_id` varchar(255) DEFAULT NULL,
# `gpt` int DEFAULT NULL,
# `avatar_url` varchar(255) DEFAULT NULL,
# `name` varchar(255) DEFAULT NULL,
# `company` varchar(255) DEFAULT NULL,
# `location` varchar(255) DEFAULT NULL,
# `followers` int DEFAULT NULL,
# `author_id` varchar(50) DEFAULT NULL,
# `type` varchar(50) DEFAULT NULL,
# `login` varchar(255) DEFAULT NULL,
# `created_at` varchar(255) DEFAULT NULL,
# `updated_at` varchar(255) DEFAULT NULL,
# KEY `actor_email` (`actor_email`)
#) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
#------------------------------------------------------------------
# Runtime environment: Python 2.7.18, MySQL 5.7
from __future__ import division
import os,requests,urllib,sys,time,json,urllib2,gzip,random
import MySQLdb
from StringIO import StringIO
def api_github(myurl, github_token, timeout=30):
    """Download one GitHub URL and return the response body.

    The request carries a cookie (read from ``github_cookies.txt`` in the
    current directory when that file exists, otherwise a built-in default),
    an ``Authorization: token ...`` header, a browser-like User-Agent and a
    Referer.  A gzip-encoded response is decompressed before it is returned.

    Args:
        myurl: URL to fetch.
        github_token: GitHub access token placed in the Authorization header.
        timeout: Socket timeout in seconds, so a stalled connection raises
            instead of blocking the crawler forever.

    Returns:
        The response body as a byte string.

    Raises:
        Any exception raised by ``urllib2.urlopen`` or while reading the
        response (for example ``urllib2.HTTPError``).  Errors are deliberately
        not swallowed here: the caller inspects them to decide whether to
        retry.
    """
    # Default cookie, used only when no github_cookies.txt is present.
    github_cookies = 'logged_in=no;tz=Asia%2FShanghai;_octo=GH1.1.693787499.1651047541;_gh_sess=zJpIR8dslqtlf9xuaA55ErkQoLjfVy3a8tiFpjF%2Fkr9%2F8VU%2BNjjujqysHHTC3lmOYUAhf1TEhky0CYq4XJPmwPi8duA1Eaot4Z%2FMIhE%2BPK6VDp4bO%2BUgqIlO3cbdAHSyc0R9Oynm%2FiRbL%2BuVV2Tt5lf2RVlUIfgN5FfCIodnJhozuuXCRXn7FSqMvR7KG9EX%2FQKktDHRYrnBosUpnV5JBIbZpYJ8qLuy5cReJSl8BlSYo8nqzKlYn3%2FO7%2BrwG916a4VjZPotuZ4c1C4yQZMgaA%3D%3D--JQ5IVzgrJsm%2F5gR7--HNRdOQU6i7zG1%2BK8LVeMVA%3D%3D;'
    if os.path.exists('github_cookies.txt'):
        with open("github_cookies.txt", "r") as cookie_file:
            github_cookies = cookie_file.read()
    github_cookies = github_cookies.strip()

    req = urllib2.Request(myurl)
    req.add_header("Cookie", github_cookies)
    req.add_header("Authorization", 'token ' + github_token)
    # Only advertise gzip: it is the only encoding decoded below, so offering
    # "deflate" as well could hand the caller a still-compressed body.
    req.add_header('Accept-Encoding', 'gzip')
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4844.82 Safari/537.36')
    req.add_header("Referer", "https://www.github.com/")

    res = urllib2.urlopen(req, timeout=timeout)
    try:
        body = res.read()
        if res.info().get('Content-Encoding') == 'gzip':
            body = gzip.GzipFile(fileobj=StringIO(body)).read()
    finally:
        # Release the connection even when reading or decompressing fails.
        res.close()
    return body
# GitHub personal access tokens (PATs) to rotate through; fill in before running.
TOKEN_LIST = [""]

# Values written to repo.gpt; NULL means the row has not been processed yet.
GPT_DONE = 1
GPT_FAILED = 2

# Pick 1000 random unprocessed rows so several workers can run side by side.
SELECT_BATCH_SQL = 'SELECT actor_email,any_repo_path,any_commit_id from repo where actor_email is not null and gpt is null order by rand() limit 1000'
UPDATE_PROFILE_SQL = "update repo set avatar_url=%s,type=%s,name=%s,company=%s,location=%s,followers=%s,author_id=%s,login=%s,gpt=%s,created_at=%s,updated_at=%s where actor_email=%s"


def _execute_and_commit(conn, sql, param):
    """Run one write statement on a short-lived cursor and commit it."""
    cursor = conn.cursor()
    try:
        cursor.execute(sql, param)
        conn.commit()
    finally:
        cursor.close()


def _mark_failed(conn, actor_email):
    """Flag every row of *actor_email* as failed (gpt=2)."""
    _execute_and_commit(conn, "update repo set gpt=%s where actor_email=%s",
                        (GPT_FAILED, actor_email))


def _already_processed(conn, actor_email):
    """Return True when the row no longer needs processing (gpt > 0 or row gone)."""
    # End the current transaction first so the check reads fresh data; with
    # InnoDB's default isolation level a SELECT in a still-open transaction
    # may be served from the snapshot taken by the earlier batch query.
    conn.commit()
    cursor = conn.cursor()
    try:
        # Parameterized: the e-mail comes from crawled data and may contain quotes.
        cursor.execute('SELECT gpt from repo where actor_email=%s', (actor_email,))
        gpts = cursor.fetchone()
    finally:
        cursor.close()
    if gpts is None:
        return True
    return gpts[0] is not None and gpts[0] > 0


def _fetch_commit(commit_url, token):
    """Download the commit JSON, retrying every 5 seconds until it succeeds.

    Returns None when GitHub answers HTTP 422, i.e. there is no result to
    retry for.  The access token is intentionally not printed.
    """
    while 1:
        try:
            return api_github(commit_url, token)
        except Exception as error:
            if 'HTTP Error 422' in str(error):
                return None
            print('----------------get url error---------------------------')
            print(str(error))
            print(commit_url)
            time.sleep(5)


def _profile_params(profile, actor_email):
    """Build the parameter tuple for UPDATE_PROFILE_SQL from a GitHub user object."""
    return (
        profile['avatar_url'],
        profile['type'],
        profile['name'],
        profile['company'],
        profile['location'],
        profile['followers'],
        profile['id'],
        profile['login'],
        GPT_DONE,
        profile['created_at'],
        profile['updated_at'],
        actor_email,
    )


def _process_row(conn, row, token):
    """Resolve one (actor_email, repo_path, commit_id) row to a GitHub profile.

    Looks up the commit through the GitHub API, follows its author URL and
    stores the author's profile fields on the row.  Any exception raised here
    is handled by the caller, which marks the row as failed.
    """
    actor_email, any_repo_path, any_commit_id = row
    print(actor_email)
    # Another worker may have handled this e-mail since the batch was read.
    if _already_processed(conn, actor_email):
        return

    commit_url = 'https://api.github.com/repos/' + any_repo_path + '/commits/' + any_commit_id
    print(commit_url)
    myjson = _fetch_commit(commit_url, token)
    if myjson is None:
        # HTTP 422: mark as failed and stop here, so that no stale response
        # from an earlier row can be written to this e-mail.
        _mark_failed(conn, actor_email)
        return

    commit = json.loads(myjson)
    if commit is None:
        return
    # 'author' is null for commits not linked to a GitHub account; the
    # resulting TypeError is turned into gpt=2 by the caller.
    author_url = commit['author']["url"]
    profile = json.loads(api_github(author_url, token))
    if profile is None:
        return
    # Save the profile to the database.
    _execute_and_commit(conn, UPDATE_PROFILE_SQL, _profile_params(profile, actor_email))


def main():
    """Process batches of unprocessed rows until none are left."""
    conn = MySQLdb.connect(host="", user="", passwd="", db="", charset="utf8")  # MySQL connection settings
    try:
        while 1:
            # Reshuffle the token list for every batch.
            token_list = list(TOKEN_LIST)
            if not token_list:
                raise SystemExit('TOKEN_LIST is empty; add at least one GitHub token')
            random.shuffle(token_list)

            cursor = conn.cursor()
            try:
                cursor.execute(SELECT_BATCH_SQL)
                rows = cursor.fetchall()
            finally:
                cursor.close()
            if not rows:
                # Nothing left to do: stop instead of polling the database in a tight loop.
                print('no unprocessed rows left')
                break

            x = 0
            for row in rows:
                # Round-robin over the tokens (advance first, then use).
                x = (x + 1) % len(token_list)
                try:
                    _process_row(conn, row, token_list[x])
                except Exception as error:
                    # "except Exception" (not a bare except) keeps Ctrl-C and
                    # SystemExit able to stop the script.
                    print('error')
                    print(str(error))
                    # Record the failure in the database.
                    _mark_failed(conn, row[0])
    finally:
        conn.close()


if __name__ == "__main__":
    main()
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册