Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
CSDN 技术社区
1024 Report
提交
ad67972f
1
1024 Report
项目概览
CSDN 技术社区
/
1024 Report
通知
84
Star
6
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
1
1024 Report
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
ad67972f
编写于
10月 13, 2022
作者:
Miykael_xxm
🚴
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add commit author info crawler python script by DaXiaoQiang
上级
78a1639d
变更
1
显示空白变更内容
内联
并排
Showing
1 changed file
with
164 addition
and
0 deletion
+164
-0
src/github_author_claw.py
src/github_author_claw.py
+164
-0
未找到文件。
src/github_author_claw.py
0 → 100644
浏览文件 @
ad67972f
#!/usr/bin/env python2
# coding=utf8
# author: @DaXiaoQiang
#-------------------相关表结构----------------------------------
#CREATE TABLE `repo` (
# `actor_email` varchar(255) DEFAULT NULL,
# `sum_total` varchar(255) DEFAULT NULL,
# `any_repo_path` varchar(255) DEFAULT NULL,
# `any_commit_id` varchar(255) DEFAULT NULL,
# `gpt` int DEFAULT NULL,
# `avatar_url` varchar(255) DEFAULT NULL,
# `name` varchar(255) DEFAULT NULL,
# `company` varchar(255) DEFAULT NULL,
# `location` varchar(255) DEFAULT NULL,
# `followers` int DEFAULT NULL,
# `author_id` varchar(50) DEFAULT NULL,
# `type` varchar(50) DEFAULT NULL,
# `login` varchar(255) DEFAULT NULL,
# `created_at` varchar(255) DEFAULT NULL,
# `updated_at` varchar(255) DEFAULT NULL,
# KEY `actor_email` (`actor_email`)
#) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
#------------------------------------------------------------------
#运行环境python 2.7.18 mysql5.7
from
__future__
import
division
import
os
,
requests
,
urllib
,
sys
,
time
,
json
,
urllib2
,
gzip
,
random
import
MySQLdb
from
StringIO
import
StringIO
#下载一个github页面
def api_github(myurl, github_token):
    """Download one GitHub URL and return the decoded response body.

    The request carries a Cookie header (read from ``github_cookies.txt`` in
    the current directory when that file exists, otherwise a built-in
    default), a ``token`` Authorization header, a browser User-Agent and a
    GitHub Referer.

    Args:
        myurl: Full URL to fetch.
        github_token: GitHub access token sent as ``Authorization: token ...``.

    Returns:
        The response body as a byte string, decompressed when the server
        answered with ``Content-Encoding: gzip`` or ``deflate``.

    Raises:
        Whatever ``urllib2.urlopen`` raises (for example ``urllib2.HTTPError``
        for non-2xx answers); errors are deliberately left to the caller.
    """
    # Local import: zlib is only needed for the "deflate" branch below and the
    # file-level import block does not provide it.
    import zlib

    github_cookies = 'logged_in=no;tz=Asia%2FShanghai;_octo=GH1.1.693787499.1651047541;_gh_sess=zJpIR8dslqtlf9xuaA55ErkQoLjfVy3a8tiFpjF%2Fkr9%2F8VU%2BNjjujqysHHTC3lmOYUAhf1TEhky0CYq4XJPmwPi8duA1Eaot4Z%2FMIhE%2BPK6VDp4bO%2BUgqIlO3cbdAHSyc0R9Oynm%2FiRbL%2BuVV2Tt5lf2RVlUIfgN5FfCIodnJhozuuXCRXn7FSqMvR7KG9EX%2FQKktDHRYrnBosUpnV5JBIbZpYJ8qLuy5cReJSl8BlSYo8nqzKlYn3%2FO7%2BrwG916a4VjZPotuZ4c1C4yQZMgaA%3D%3D--JQ5IVzgrJsm%2F5gR7--HNRdOQU6i7zG1%2BK8LVeMVA%3D%3D;'
    # A cookie file next to the script overrides the built-in default.
    if os.path.exists('github_cookies.txt'):
        with open("github_cookies.txt", "r") as cookie_file:
            github_cookies = cookie_file.read()
    github_cookies = github_cookies.strip()

    req = urllib2.Request(myurl)
    req.add_header("Cookie", github_cookies)
    req.add_header("Authorization", 'token ' + github_token)
    req.add_header('Accept-Encoding', 'gzip, deflate')
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4844.82 Safari/537.36')
    req.add_header("Referer", "https://www.github.com/")

    res = urllib2.urlopen(req)
    try:
        content_encoding = res.info().get('Content-Encoding')
        body = res.read()
    finally:
        # Always release the connection, even when reading fails.
        res.close()

    if content_encoding == 'gzip':
        return gzip.GzipFile(fileobj=StringIO(body)).read()
    if content_encoding == 'deflate':
        # The request advertises "deflate", so it has to be decoded as well.
        # Servers send either a zlib-wrapped or a raw deflate stream.
        try:
            return zlib.decompress(body)
        except zlib.error:
            return zlib.decompress(body, -zlib.MAX_WBITS)
    return body
# Values written to repo.gpt to record the outcome for a row.
_GPT_DONE = 1
_GPT_FAILED = 2

# HTTP failures for which retrying the same URL cannot help. 422 was already
# treated this way; 404 is added because a missing repository/commit would
# otherwise be retried forever.
_PERMANENT_HTTP_ERRORS = ('HTTP Error 404', 'HTTP Error 422')

# Pause between retries and between polls of an empty work queue.
_RETRY_DELAY_SECONDS = 5


def _mask_token(token):
    """Return a log-safe form of an access token (first four characters)."""
    return token[:4] + '***'


def _mark_failed(conn, actor_email):
    """Set gpt=2 for every repo row with this e-mail and commit."""
    cursor = conn.cursor()
    try:
        cursor.execute("update repo set gpt=%s where actor_email=%s",
                       (_GPT_FAILED, actor_email))
        conn.commit()
    finally:
        cursor.close()


def _already_processed(conn, actor_email):
    """Return True when the row is gone or its gpt flag is already positive."""
    cursor = conn.cursor()
    try:
        # Parameterized: the e-mail comes from crawled data and may contain
        # quotes, so it must never be concatenated into the statement.
        cursor.execute('SELECT gpt from repo where actor_email=%s',
                       (actor_email,))
        status_row = cursor.fetchone()
    finally:
        cursor.close()
    if status_row is None:
        return True
    return (status_row[0] or 0) > 0


def _fetch_commit_body(commit_url, token):
    """Download the commit JSON, retrying every 5 seconds on other errors.

    Returns the raw response body, or None when the request failed with one
    of the permanent HTTP errors.
    """
    while True:
        try:
            return api_github(commit_url, token)
        except Exception as e:
            message = str(e)
            if any(marker in message for marker in _PERMANENT_HTTP_ERRORS):
                return None
            print('----------------get url error---------------------------')
            print(message)
            print(_mask_token(token))
            print(commit_url)
            time.sleep(_RETRY_DELAY_SECONDS)


def _save_author(conn, actor_email, profile):
    """Copy the GitHub user profile fields into the repo table, gpt=1."""
    sql = "update repo set avatar_url=%s,type=%s,name=%s,company=%s,location=%s,followers=%s,author_id=%s,login=%s,gpt=%s,created_at=%s,updated_at=%s where actor_email=%s"
    param = (
        profile['avatar_url'],
        profile['type'],
        profile['name'],
        profile['company'],
        profile['location'],
        profile['followers'],
        profile['id'],
        profile['login'],
        _GPT_DONE,
        profile['created_at'],
        profile['updated_at'],
        actor_email,
    )
    cursor = conn.cursor()
    try:
        cursor.execute(sql, param)
        conn.commit()
    finally:
        cursor.close()


def _process_row(conn, actor_email, any_repo_path, any_commit_id, token):
    """Resolve one commit to its GitHub author and store the profile."""
    commit_url = ('https://api.github.com/repos/' + any_repo_path +
                  '/commits/' + any_commit_id)
    print(commit_url)

    commit_body = _fetch_commit_body(commit_url, token)
    if commit_body is None:
        # Permanent HTTP error: record it and stop here, so that no stale
        # response from an earlier row can be parsed for this e-mail.
        _mark_failed(conn, actor_email)
        return

    commit_info = json.loads(commit_body)
    if commit_info is None:
        return

    # Raises when the commit has no linked GitHub account ("author" is null);
    # the caller records that as a failure.
    author_url = commit_info['author']["url"]
    profile = json.loads(api_github(author_url, token))
    if profile is None:
        return

    _save_author(conn, actor_email, profile)


if __name__ == "__main__":
    # MySQL connection settings (fill in before running).
    conn = MySQLdb.connect(host="", user="", passwd="", db="", charset="utf8")
    # GitHub personal access tokens, used in rotation.
    token_list = [""]
    try:
        while True:
            random.shuffle(token_list)

            # Pick 1000 random unprocessed rows so that several workers can
            # run side by side without all taking the same rows.
            cursor = conn.cursor()
            try:
                cursor.execute('SELECT actor_email,any_repo_path,any_commit_id from repo where actor_email is not null and gpt is null order by rand() limit 1000')
                rows = cursor.fetchall()
            finally:
                cursor.close()

            if not rows:
                # Nothing pending: wait instead of re-running the
                # ORDER BY RAND() query in a tight loop.
                time.sleep(_RETRY_DELAY_SECONDS)
                continue

            token_index = 0
            for row in rows:
                actor_email, any_repo_path, any_commit_id = row[0], row[1], row[2]
                try:
                    print(actor_email)
                    # Another worker may have handled this e-mail meanwhile.
                    if _already_processed(conn, actor_email):
                        continue

                    # Advance to the next token, wrapping at the end.
                    token_index = (token_index + 1) % len(token_list)
                    mytoken = token_list[token_index]
                    print(_mask_token(mytoken))

                    _process_row(conn, actor_email, any_repo_path,
                                 any_commit_id, mytoken)
                except Exception:
                    # Any failure for this row is recorded in the database;
                    # KeyboardInterrupt/SystemExit still stop the script.
                    print('error')
                    _mark_failed(conn, actor_email)
    finally:
        conn.close()
\ No newline at end of file
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录