提交 89cef470 编写于 作者: F feilong

add personal rank

上级 656cf5f7
此差异已折叠。
此差异已折叠。
...@@ -103,6 +103,32 @@ config = { ...@@ -103,6 +103,32 @@ config = {
} }
] ]
}, },
"repo_github_user_commit_info": {
"file": "../data/CSDN/commit_analysis.csv",
"desc": "开发者在开源项目的贡献数信息",
"fields": [
{
"field_name": "repo_name",
"field_type": "str",
"desc": "开源项目仓库名字"
},
{
"field_name": "any_commit_id",
"field_type": "str",
"desc": "开发者在对应开源项目上贡献的任意commit id"
},
{
"field_name": "actor_email",
"field_type": "str",
"desc": "用户邮箱"
},
{
"field_name": "total",
"field_type": "int",
"desc": "开发者累计Github项目贡献数"
},
]
},
"repo_github_info": { "repo_github_info": {
"file": "../data/Github-Repos.xlsx", "file": "../data/Github-Repos.xlsx",
"sheet_name": "汇总", "sheet_name": "汇总",
......
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from audioop import reverse
import os import os
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from config import config
def load_repo_github_user_info(config, ctx):
    """Load the GitHub user table from Excel and store it in ``ctx``.

    The workbook path and sheet name are read from
    ``config["schema"]["repo_github_user_info"]``. Missing cells are
    replaced with 0 before the frame is stored under
    ``ctx["repo_github_user_info"]``.
    """
    table = config["schema"]["repo_github_user_info"]
    frame = pd.read_excel(table["file"], sheet_name=table["sheet_name"])
    frame.fillna(value=0, inplace=True)
    ctx["repo_github_user_info"] = frame
def load_repo_github_info(config, ctx): def load_repo_github_info(config, ctx):
...@@ -53,6 +44,33 @@ def load_repo_github_popular_trends(config, ctx): ...@@ -53,6 +44,33 @@ def load_repo_github_popular_trends(config, ctx):
ctx["repo_github_popular_trends"] = df ctx["repo_github_popular_trends"] = df
def load_repo_github_user_info(config, ctx):
    """Read the per-user GitHub info sheet and publish it on ``ctx``.

    Uses the ``file`` and ``sheet_name`` entries of
    ``config["schema"]["repo_github_user_info"]``. Empty cells become 0 and
    the resulting frame is saved as ``ctx["repo_github_user_info"]``.
    """
    # NOTE: a progress-bar callback based on IncrementalBar was sketched here
    # at one point but is disabled; loading currently reports no progress.
    source = config["schema"]["repo_github_user_info"]
    workbook_path = source["file"]
    sheet = source["sheet_name"]
    users = pd.read_excel(workbook_path, sheet_name=sheet)
    users.fillna(value=0, inplace=True)
    ctx["repo_github_user_info"] = users
def load_repo_github_user_commit_info(config, ctx):
    """Load the per-repo, per-user commit-count CSV and store it in ``ctx``.

    The CSV path comes from
    ``config["schema"]["repo_github_user_commit_info"]["file"]``. Missing
    values are replaced with 0 and the frame is stored under
    ``ctx["repo_github_user_commit_info"]``.
    """
    csv_path = config["schema"]["repo_github_user_commit_info"]["file"]
    commits = pd.read_csv(csv_path)
    commits.fillna(value=0, inplace=True)
    ctx["repo_github_user_commit_info"] = commits
def rank_repo_top_n(config, ctx): def rank_repo_top_n(config, ctx):
repo_rank = [] repo_rank = []
repo_dict = {} repo_dict = {}
...@@ -136,24 +154,131 @@ def rank_repo_top_n(config, ctx): ...@@ -136,24 +154,131 @@ def rank_repo_top_n(config, ctx):
df = df.sort_values(by="score", ascending=False) df = df.sort_values(by="score", ascending=False)
df.to_csv(config["ranks"]["repo"]["top_n"]) df.to_csv(config["ranks"]["repo"]["top_n"])
for index, row in df.iterrows():
repo_name = index
repo_item = repo_dict[repo_name]
repo_item["score"] = row["score"]
ctx["ranks"]["repo_top_n"] = df
ctx["ranks"]["repo_top_n_dict"] = repo_dict
def rank_personal_top_n(config, ctx):
    """Rank contributors by their score-weighted commits to the ranked repos.

    Reads:
        ctx["ranks"]["repo_top_n_dict"]: mapping of repo key to a repo item
            dict that carries a ``"score"`` entry.
        ctx["repo_github_user_info"]: DataFrame of user profiles, one row per
            ``actor_email``.
        ctx["repo_github_user_commit_info"]: DataFrame with ``repo_name``,
            ``actor_email`` and ``total`` (commit count) columns.

    A person's score is the sum over repos of ``total * repo score``.

    Side effects:
        * Each matched repo item gains a ``"user_contribute_list"`` entry,
          sorted by ``total`` in descending order.
        * The ranking, sorted by score in descending order, is written as CSV
          to ``config["ranks"]["personal"]["top_n"]`` and its head is printed.

    Contributors without a matching profile row are still ranked; they just
    carry a score and no profile columns.
    """
    profile_fields = (
        'avatar_url', 'name', 'company', 'location',
        'followers', 'author_id', 'type', 'login',
    )
    repo_top_n_dict = ctx["ranks"]["repo_top_n_dict"]

    # Index the user profile table by lower-cased e-mail (the join key).
    personal_dict = {}
    for _, row in ctx["repo_github_user_info"].iterrows():
        email = row["actor_email"]
        # Skip bad rows: the loaders replace missing cells with 0, so a
        # missing e-mail shows up here as a non-string value.
        if not isinstance(email, str):
            continue
        person = {field: row[field] for field in profile_fields}
        person['all_repo_contribute_total'] = int(row['sum_total'])
        personal_dict[email.lower()] = person

    # Attach every commit record to the ranked repo it belongs to.
    for _, row in ctx["repo_github_user_commit_info"].iterrows():
        email = row["actor_email"]
        repo_name = row["repo_name"]
        # Skip bad rows; a non-string repo name would otherwise crash on
        # .lower() below.
        if not isinstance(email, str) or not isinstance(repo_name, str):
            continue
        repo_item = repo_top_n_dict.get(repo_name.lower())
        if repo_item is None:
            # The repo is not part of the repo ranking; nothing to weight by.
            continue
        repo_item.setdefault('user_contribute_list', []).append({
            "actor_email": email,
            "total": row["total"],
            "repo_score": repo_item['score'],  # score of the repo itself
        })

    # Sort contributors inside each repo, then fold them into one table.
    personal_score_dict = {}
    for repo_key, repo_item in repo_top_n_dict.items():
        contributors = repo_item.get('user_contribute_list')
        if contributors is None:
            print(f"[warn] missing contributors repo:{repo_key}")
            continue
        contributors.sort(key=lambda item: item["total"], reverse=True)

        for contributor in contributors:
            weighted = contributor["total"] * contributor["repo_score"]
            person_key = contributor['actor_email'].lower()
            person_item = personal_score_dict.get(person_key)
            if person_item is not None:
                person_item['score'] += weighted
                continue

            person_item = {"score": weighted}
            personal_score_dict[person_key] = person_item
            person_info = personal_dict.get(person_key)
            # No profile row for this e-mail: keep the score-only entry.
            if person_info is None:
                continue
            for field in profile_fields:
                person_item[field] = person_info[field]

    # Build the ranking. sort_values returns a new frame, so the result has
    # to be re-bound; an empty table has no "score" column to sort by.
    df = pd.DataFrame.from_dict(personal_score_dict, orient='index')
    if "score" in df.columns:
        df = df.sort_values(by="score", ascending=False)
    df.to_csv(config["ranks"]["personal"]["top_n"])
    print(df.head())
def calc_ranks(config, options):
ctx = {}
# print("@load_repo_github_user_info..") def rank_company_top_n(config, ctx):
# load_repo_github_user_info(config, ctx) pass
# print("@load_repo_commit_rank..")
# load_repo_commit_rank(config, ctx)
def calc_repo_rank(config, options, ctx):
print("@load_repo_github_info..") print("@load_repo_github_info..")
load_repo_github_info(config, ctx) load_repo_github_info(config, ctx)
...@@ -169,8 +294,29 @@ def calc_ranks(config, options): ...@@ -169,8 +294,29 @@ def calc_ranks(config, options):
print("@rank_repo_top_n..") print("@rank_repo_top_n..")
rank_repo_top_n(config, ctx) rank_repo_top_n(config, ctx)
# print("@rank_personal_top_n..")
# rank_personal_top_n(config, ctx)
def calc_personal_rank(config, options, ctx):
    """Load the user tables and compute the personal ranking.

    Runs, in order, the user-info loader, the user-commit loader and
    ``rank_personal_top_n``, printing a banner before each step.
    ``options`` is accepted but not used here.
    """
    pipeline = (
        ("@load_repo_github_user_info..", load_repo_github_user_info),
        ("@load_repo_github_user_commit_info..",
         load_repo_github_user_commit_info),
        ("@rank_personal_top_n..", rank_personal_top_n),
    )
    for banner, step in pipeline:
        print(banner)
        step(config, ctx)
def calc_company_rank(config, options, ctx):
    """Placeholder for the company ranking stage; currently does nothing."""
    # The intended call is still disabled:
    # print("@rank_company_top_n..")
    # rank_company_top_n(config, ctx)
    return None
def calc_ranks(config, options):
    """Run the ranking stages over one shared context.

    Creates a fresh context holding an empty ``"ranks"`` dict, then runs the
    repo ranking followed by the personal ranking. The company ranking stage
    is not enabled yet.
    """
    ctx = dict(ranks={})
    for stage in (calc_repo_rank, calc_personal_rank):
        stage(config, options, ctx)
    # calc_company_rank(config, options, ctx)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册