#!/usr/bin/env python2
# coding=utf8
# author: @DaXiaoQiang
#------------------- Related table schema ----------------------------------
#CREATE TABLE `repo` (
#  `actor_email` varchar(255) DEFAULT NULL,
#  `sum_total` varchar(255) DEFAULT NULL,
#  `any_repo_path` varchar(255) DEFAULT NULL,
#  `any_commit_id` varchar(255) DEFAULT NULL,
#  `gpt` int DEFAULT NULL,
#  `avatar_url` varchar(255) DEFAULT NULL,
#  `name` varchar(255) DEFAULT NULL,
#  `company` varchar(255) DEFAULT NULL,
#  `location` varchar(255) DEFAULT NULL,
#  `followers` int DEFAULT NULL,
#  `author_id` varchar(50) DEFAULT NULL,
#  `type` varchar(50) DEFAULT NULL,
#  `login` varchar(255) DEFAULT NULL,
#  `created_at` varchar(255) DEFAULT NULL,
#  `updated_at` varchar(255) DEFAULT NULL,
#  KEY `actor_email` (`actor_email`)
#) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
#------------------------------------------------------------------
# Runtime environment: Python 2.7.18, MySQL 5.7
#
# Meaning of the `gpt` column as used by this script:
#   NULL -> row not processed yet
#   1    -> author profile fetched and stored
#   2    -> lookup failed (HTTP 422 or any other error while processing)
from __future__ import division

import gzip
import json
import os
import random
import sys
import time
import urllib
import urllib2
from StringIO import StringIO

import MySQLdb
import requests

# Cookie header used when no github_cookies.txt exists in the working directory.
# NOTE(review): this is a hard-coded session cookie; consider moving it out of
# the source file.
_DEFAULT_GITHUB_COOKIES = 'logged_in=no;tz=Asia%2FShanghai;_octo=GH1.1.693787499.1651047541;_gh_sess=zJpIR8dslqtlf9xuaA55ErkQoLjfVy3a8tiFpjF%2Fkr9%2F8VU%2BNjjujqysHHTC3lmOYUAhf1TEhky0CYq4XJPmwPi8duA1Eaot4Z%2FMIhE%2BPK6VDp4bO%2BUgqIlO3cbdAHSyc0R9Oynm%2FiRbL%2BuVV2Tt5lf2RVlUIfgN5FfCIodnJhozuuXCRXn7FSqMvR7KG9EX%2FQKktDHRYrnBosUpnV5JBIbZpYJ8qLuy5cReJSl8BlSYo8nqzKlYn3%2FO7%2BrwG916a4VjZPotuZ4c1C4yQZMgaA%3D%3D--JQ5IVzgrJsm%2F5gR7--HNRdOQU6i7zG1%2BK8LVeMVA%3D%3D;'

_USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
               '(KHTML, like Gecko) Chrome/101.0.4844.82 Safari/537.36')


def api_github(myurl, github_token):
    """Download one GitHub URL and return the raw response body.

    The request carries the cookie string (read from ``github_cookies.txt``
    when that file exists, otherwise the built-in default), an
    ``Authorization: token ...`` header built from *github_token*, and
    browser-like User-Agent / Referer headers.  A gzip-encoded response is
    decompressed before it is returned.

    Args:
        myurl: URL to fetch.
        github_token: token placed in the Authorization header.

    Returns:
        The response body as a byte string.

    Raises:
        Whatever ``urllib2.urlopen`` raises (e.g. ``urllib2.HTTPError``,
        ``urllib2.URLError``); errors are deliberately not swallowed here so
        the caller can decide whether to retry.
    """
    github_cookies = _DEFAULT_GITHUB_COOKIES
    if os.path.exists('github_cookies.txt'):
        with open("github_cookies.txt", "r") as cookie_file:
            github_cookies = cookie_file.read().strip()

    req = urllib2.Request(myurl)
    req.add_header("Cookie", github_cookies)
    req.add_header("Authorization", 'token ' + github_token)
    req.add_header('Accept-Encoding', 'gzip, deflate')
    req.add_header('User-Agent', _USER_AGENT)
    req.add_header("Referer", "https://www.github.com/")

    res = urllib2.urlopen(req)
    try:
        body = res.read()
        if res.info().get('Content-Encoding') == 'gzip':
            body = gzip.GzipFile(fileobj=StringIO(body)).read()
        return body
    finally:
        # Close the response even when reading or decompressing fails.
        res.close()


def _mark_failed(conn, cursor, actor_email):
    """Set gpt=2 on every row of *actor_email* and commit."""
    cursor.execute("update repo set gpt=%s where actor_email=%s", (2, actor_email))
    conn.commit()


def _fetch_commit_json(conn, cursor, commit_url, mytoken, actor_email):
    """Fetch *commit_url*, retrying every 5 seconds until it succeeds.

    Returns the response body, or None when GitHub answered HTTP 422, in
    which case the row has already been marked gpt=2.
    """
    while True:
        try:
            return api_github(commit_url, mytoken)
        except Exception as e:
            # HTTP 422 means there is nothing to return for this commit:
            # record the failure and give up on the row.
            if str(e).find('HTTP Error 422') >= 0:
                _mark_failed(conn, cursor, actor_email)
                return None
            # NOTE(review): every other error (including permanent ones such
            # as a 404) is retried forever -- confirm this is intended.
            print('----------------get url error---------------------------')
            print(str(e))
            print(mytoken)
            print(commit_url)
            time.sleep(5)


def main():
    """Poll the repo table forever and fill in GitHub author profiles."""
    conn = MySQLdb.connect(host="", user="", passwd="", db="", charset="utf8")  # MySQL connection settings
    cursor = conn.cursor()
    while True:
        # Rotate through the list of GitHub keys.
        token_list = [""]  # GitHub personal access tokens (PATs)
        random.shuffle(token_list)
        # Randomly select 1000 unprocessed rows, which makes multi-threaded
        # scheduling easier.
        sql = 'SELECT actor_email,any_repo_path,any_commit_id from repo where actor_email is not null and gpt is null order by rand() limit 1000'
        cursor.execute(sql)
        rows = cursor.fetchall()
        x = 0
        for row in rows:
            actor_email = row[0]
            any_repo_path = row[1]
            any_commit_id = row[2]
            try:
                print(actor_email)
                # Check whether this record has already been processed
                # (parameterized: the e-mail comes from scraped data).
                cursor.execute('SELECT gpt from repo where actor_email=%s', (actor_email,))
                gpts = cursor.fetchone()
                if gpts is None:
                    # The row disappeared since the batch was selected.
                    continue
                if gpts[0] is not None and gpts[0] > 0:
                    continue

                # Advance to the next token, wrapping at the end of the list.
                if x >= len(token_list) - 1:
                    x = 0
                else:
                    x = x + 1
                mytoken = token_list[x]
                commit_url = 'https://api.github.com/repos/' + any_repo_path + '/commits/' + any_commit_id
                print(mytoken)
                print(commit_url)

                commit_json = _fetch_commit_json(conn, cursor, commit_url, mytoken, actor_email)
                if commit_json is None:
                    # Already marked gpt=2; skip so that no stale response
                    # from an earlier row is parsed and stored for this one.
                    continue

                commit = json.loads(commit_json)
                if commit is None:
                    continue
                # Raises (and is recorded as gpt=2 below) when the commit has
                # no linked GitHub author.
                author_url = commit['author']["url"]
                profile = json.loads(api_github(author_url, mytoken))
                if profile is None:
                    continue

                avatar_url = profile['avatar_url']
                author_type = profile['type']
                name = profile['name']
                company = profile['company']
                location = profile['location']
                followers = profile['followers']
                author_id = profile['id']
                login = profile['login']
                created_at = profile['created_at']
                updated_at = profile['updated_at']
                # Save the data to the database.
                sql = "update repo set avatar_url=%s,type=%s,name=%s,company=%s,location=%s,followers=%s,author_id=%s,login=%s,gpt=%s,created_at=%s,updated_at=%s where actor_email=%s"
                param = (avatar_url, author_type, name, company, location, followers,
                         author_id, login, 1, created_at, updated_at, actor_email)
                cursor.execute(sql, param)
                conn.commit()
            except Exception as e:
                # Exception (not a bare except) so Ctrl-C / SystemExit still
                # stop the script.
                print('error')
                print(str(e))
                # Record the error in the database.
                _mark_failed(conn, cursor, actor_email)


if __name__ == "__main__":
    main()