# -*- coding: utf-8 -*-
"""Crawl basic statistics for a list of GitHub repositories.

Provenance: added as ``src/repoinfo.py`` by commit
3f72bb3c8b9a10ec2fda81098b7998c264befd8d ("add repo info crawler python
script", Miykaelxxm, 2022-10-13).

The script reads ``dataset/repo-list.json`` (a JSON array of objects that each
carry a ``full_name`` key), queries ``https://api.github.com/repos/<full_name>``
for every entry, and writes one comma-separated line per repository to
``dataset/result.txt``::

    id,owner login,full_name,stargazers_count,forks_count

NOTE: an earlier, disabled variant of this script also requested
``/contributors?per_page=1&anon=true`` and took the last number found in the
``Link`` response header as an extra trailing column. That experiment is not
part of the current output format.
"""
from os import link  # unused here; kept so the module's import surface is unchanged
import json
import re  # unused here; only the disabled contributors experiment needed it
import sys
import time

import requests

REPO_LIST_PATH = "dataset/repo-list.json"
RESULT_PATH = 'dataset/result.txt'
API_ROOT = "https://api.github.com/repos/"

# Fill in your own GitHub Personal Access Token here. The value is sent
# verbatim as the ``Authorization`` header, so it should be the complete header
# value (presumably of the form 'Bearer <token>' -- confirm against the GitHub
# authentication documentation).
AUTHORIZATION = ''

# Seconds to wait for GitHub before giving up on a single request; without a
# timeout a stalled connection would block the crawl forever.
REQUEST_TIMEOUT = 30

# Pause between consecutive requests, in seconds.
REQUEST_INTERVAL = 2


def _format_row(repo):
    """Return the result-file line (newline-terminated) for one repository.

    ``repo`` is the decoded JSON object returned by the repository endpoint.
    Raises ``KeyError`` if one of the expected fields is missing.
    """
    fields = [
        str(repo["id"]),
        repo["owner"]["login"],
        repo["full_name"],
        str(repo["stargazers_count"]),
        str(repo["forks_count"]),
    ]
    return ','.join(fields) + "\n"


def _fetch_repo(full_name, headers):
    """Fetch the repository description for ``full_name``.

    Returns the decoded JSON object, or ``None`` (after reporting the problem
    on stderr) when the request fails, GitHub answers with a non-200 status,
    or the body is not valid JSON. Returning ``None`` lets the caller skip a
    single bad entry instead of aborting the whole crawl.
    """
    url = API_ROOT + full_name
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print("request failed for %s: %s" % (full_name, err), file=sys.stderr)
        return None

    if response.status_code != 200:
        print(
            "skipping %s: HTTP %d" % (full_name, response.status_code),
            file=sys.stderr,
        )
        return None

    try:
        # Decode the body once; every output field is read from this object.
        return response.json()
    except ValueError as err:
        print("invalid JSON for %s: %s" % (full_name, err), file=sys.stderr)
        return None


def main():
    """Read the repository list, query GitHub, and write the result file."""
    # Load the list of repositories to crawl.
    with open(REPO_LIST_PATH, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Only send the Authorization header when a value has been configured;
    # an empty header carries no credentials.
    headers = {'Authorization': AUTHORIZATION} if AUTHORIZATION else {}

    # Mode 'w' both creates the file on a first run and empties a previous
    # result, so no separate truncate step is needed.
    with open(RESULT_PATH, 'w', encoding='utf-8') as out:
        for list_item in data:
            repo = _fetch_repo(list_item["full_name"], headers)
            if repo is not None:
                out.write(_format_row(repo))
                # Flush per row so partial results survive an interrupted run.
                out.flush()
            time.sleep(REQUEST_INTERVAL)  # Sleep for 2 seconds between requests


if __name__ == "__main__":
    main()