提交 41053a6f 编写于 作者: F feilong

init repo rank calc

上级 1ca50867
无法预览此类型文件
此差异已折叠。
import os import os
import numpy as np
import pandas as pd import pandas as pd
config = { config = {
"ranks": { "ranks": {
"personal": { "personal": {
"top_n": "../rank/个人向主要开源技术栈贡献榜单.csv", "top_n": "../ranks/个人向主要开源技术栈贡献榜单.csv",
"top_n_en": "../rank/个人向国际主要开源技术栈贡献榜单.csv", "top_n_en": "../ranks/个人向国际主要开源技术栈贡献榜单.csv",
"top_n_zh_cn": "../rank/个人向中国主要开源技术栈贡献榜单.csv", "top_n_zh_cn": "../ranks/个人向中国主要开源技术栈贡献榜单.csv",
}, },
"company": { "company": {
"top_n": "../rank/公司向主要开源技术栈贡献榜单.csv", "top_n": "../ranks/公司向主要开源技术栈贡献榜单.csv",
"top_n_en": "../rank/公司向国际主要开源技术栈贡献榜单.csv", "top_n_en": "../ranks/公司向国际主要开源技术栈贡献榜单.csv",
"top_n_zh_cn": "../rank/公司向中国主要开源技术栈贡献榜单.csv", "top_n_zh_cn": "../ranks/公司向中国主要开源技术栈贡献榜单.csv",
}, },
"repo": { "repo": {
"top_n": "../rank/开源项目榜.csv", "top_n": "../ranks/开源项目榜.csv",
"top_n_en": "../rank/开源项目榜_非中国项目.csv", "top_n_en": "../ranks/开源项目榜_非中国项目.csv",
"top_n_zh_cn": "../rank/开源项目榜_中国项目.csv" "top_n_zh_cn": "../ranks/开源项目榜_中国项目.csv"
} }
}, },
"schema": { "schema": {
...@@ -176,6 +177,32 @@ config = { ...@@ -176,6 +177,32 @@ config = {
} }
] ]
}, },
"repo_commit_rank": {
"file": "../CSDN/repo-commit-rank.csv",
"desc": "开源项目在Github的月commit变化",
"fields": [
{
"field_name": "actor_email",
"field_type": "str",
"desc": "用户邮箱"
},
{
"field_name": "sum_total",
"field_type": "int",
"desc": "用户累计Github项目贡献数"
},
{
"field_name": "any_repo_path",
"field_type": "str",
"desc": "用户贡献过的任意一个Github仓库路径"
},
{
"field_name": "any_commit_id",
"field_type": "str",
"desc": "用户在上述贡献过的Github仓库里的任意一个commit"
}
]
},
"repo_github_active_trends": { "repo_github_active_trends": {
"file": "../PingCAP/项目活跃度变化.csv", "file": "../PingCAP/项目活跃度变化.csv",
"desc": "开源项目在Github上的月活跃度数据", "desc": "开源项目在Github上的月活跃度数据",
...@@ -257,7 +284,7 @@ def load_repo_github_user_info(config, ctx): ...@@ -257,7 +284,7 @@ def load_repo_github_user_info(config, ctx):
df = pd.read_excel( df = pd.read_excel(
schema["repo_github_user_info"]["file"], schema["repo_github_user_info"]["file"],
sheet_name=schema["repo_github_user_info"]["sheet_name"]) sheet_name=schema["repo_github_user_info"]["sheet_name"])
print(df.head()) df.fillna(value=0, inplace=True)
ctx["repo_github_user_info"] = df ctx["repo_github_user_info"] = df
...@@ -266,7 +293,7 @@ def load_repo_github_info(config, ctx): ...@@ -266,7 +293,7 @@ def load_repo_github_info(config, ctx):
df = pd.read_excel( df = pd.read_excel(
schema["repo_github_info"]["file"], schema["repo_github_info"]["file"],
sheet_name=schema["repo_github_info"]["sheet_name"]) sheet_name=schema["repo_github_info"]["sheet_name"])
print(df.head()) df.fillna(value=0, inplace=True)
ctx["repo_github_info"] = df ctx["repo_github_info"] = df
...@@ -274,50 +301,154 @@ def load_repo_csdn_trends(config, ctx): ...@@ -274,50 +301,154 @@ def load_repo_csdn_trends(config, ctx):
schema = config["schema"] schema = config["schema"]
df = pd.read_excel( df = pd.read_excel(
schema["repo_csdn_trends"]["file"], schema["repo_csdn_trends"]["file"],
sheet_name=schema["repo_github_info"]["sheet_name"]) sheet_name=schema["repo_csdn_trends"]["sheet_name"])
print(df.head()) df.fillna(value=0, inplace=True)
ctx["repo_csdn_trends"] = df ctx["repo_csdn_trends"] = df
def load_repo_commit_rank(config, ctx):
    """Load the commit-rank CSV into ``ctx["repo_commit_rank"]``.

    Args:
        config: Configuration dict; the CSV path is read from
            ``config["schema"]["repo_commit_rank"]["file"]``.
        ctx: Shared pipeline context dict; the loaded DataFrame is stored
            under the ``"repo_commit_rank"`` key.

    Missing cells are replaced with 0 so later arithmetic never sees NaN.
    """
    path = config["schema"]["repo_commit_rank"]["file"]
    ctx["repo_commit_rank"] = pd.read_csv(path).fillna(value=0)
def load_repo_github_active_trends(config, ctx):
    """Load the GitHub activity-trend CSV into the pipeline context.

    Args:
        config: Configuration dict; the CSV path is read from
            ``config["schema"]["repo_github_active_trends"]["file"]``.
        ctx: Shared pipeline context dict; the loaded DataFrame is stored
            under the ``"repo_github_active_trends"`` key.

    Missing cells are replaced with 0 so later arithmetic never sees NaN.
    """
    path = config["schema"]["repo_github_active_trends"]["file"]
    ctx["repo_github_active_trends"] = pd.read_csv(path).fillna(value=0)
def load_repo_github_popular_trends(config, ctx):
    """Load the GitHub popularity-trend CSV into the pipeline context.

    Args:
        config: Configuration dict; the CSV path is read from
            ``config["schema"]["repo_github_popular_trends"]["file"]``.
        ctx: Shared pipeline context dict; the loaded DataFrame is stored
            under the ``"repo_github_popular_trends"`` key.

    Missing cells are replaced with 0 so later arithmetic never sees NaN.
    """
    path = config["schema"]["repo_github_popular_trends"]["file"]
    ctx["repo_github_popular_trends"] = pd.read_csv(path).fillna(value=0)
def rank_repo_top_n(config, ctx):
    """Score every repository and write the ranking to CSV.

    Metrics from four frames in ``ctx`` are merged per repository (matched
    on the lower-cased repository name), combined into a weighted ``score``
    column, sorted by descending score and written to
    ``config["ranks"]["repo"]["top_n"]``.

    Args:
        config: Configuration dict; only ``config["ranks"]["repo"]["top_n"]``
            (the output CSV path) is read here.
        ctx: Pipeline context holding the DataFrames
            ``repo_github_info`` (columns FullName, Region, Star, Fork,
            Contributors), ``repo_csdn_trends`` (column repo_name followed
            by value columns), ``repo_github_active_trends`` and
            ``repo_github_popular_trends`` (column repo_name plus the count
            columns used below).

    Repositories that appear in a trend frame but not in
    ``repo_github_info`` are ignored; metrics that are missing for a
    repository count as 0.
    """
    repo_dict = {}

    # Base metrics; the key is the lower-cased FullName.
    # fillna returns a new frame, so its result must be used.
    info = ctx["repo_github_info"].fillna(value=0)
    for _, row in info.iterrows():
        repo_dict[row["FullName"].lower()] = {
            "region": row["Region"],
            "star": int(row["Star"]),
            "fork": int(row["Fork"]),
            "contributors": int(row["Contributors"]),
        }

    # CSDN index: average of every column after the first one.
    # NOTE(review): assumes repo_name is the first column and all remaining
    # columns are numeric monthly values - confirm against the data file.
    for _, row in ctx["repo_csdn_trends"].iterrows():
        repo_item = repo_dict.get(row["repo_name"].lower())
        if repo_item is None:
            continue
        repo_item["csdn_index_month_avg"] = pd.to_numeric(row.iloc[1:]).mean()

    # Activity metrics, averaged over all rows of each repository.
    _merge_monthly_averages(
        repo_dict,
        ctx["repo_github_active_trends"].fillna(value=0),
        ["push_count", "pr_count", "issue_count", "creator_count"],
    )

    # Popularity metrics, averaged over all rows of each repository.
    _merge_monthly_averages(
        repo_dict,
        ctx["repo_github_popular_trends"],
        ["watch_count", "fork_count"],
    )

    # One row per repository, indexed by the lower-cased repository name.
    df = pd.DataFrame.from_dict(repo_dict, orient="index")
    df.fillna(value=0, inplace=True)

    # Relative weights of each metric; normalised below so they sum to 1.
    weights = {
        "star": 1,
        "fork": 1,
        "contributors": 1,
        "csdn_index_month_avg": 1,
        "push_count_month_avg": 1,
        "pr_count_month_avg": 1,
        "issue_count_month_avg": 1,
        "creator_count_month_avg": 1,
        "watch_count_month_avg": 1,
        "fork_count_month_avg": 1,
    }
    total_weight = sum(weights.values())

    # A metric nobody contributed yields no column at all; treat it as 0.
    for key in weights:
        if key not in df.columns:
            df[key] = 0

    score = pd.Series(0.0, index=df.index)
    for key, weight in weights.items():
        score += df[key] * (weight / total_weight)
    df["score"] = score

    print(df.head())
    df = df.sort_values(by="score", ascending=False)
    df.to_csv(config["ranks"]["repo"]["top_n"])


def _merge_monthly_averages(repo_dict, trends, columns):
    """Store the per-repository mean of each column as ``<column>_month_avg``.

    Only the listed columns are averaged, so other (possibly non-numeric)
    columns in ``trends`` cannot break the aggregation. Repositories that
    are not already present in ``repo_dict`` are skipped.
    """
    averages = trends.groupby("repo_name")[columns].mean()
    for repo_name, row in averages.iterrows():
        repo_item = repo_dict.get(repo_name.lower())
        if repo_item is None:
            continue
        for column in columns:
            repo_item[column + "_month_avg"] = row[column]
def rank_personal_top_n(config, ctx):
    """Placeholder for the personal contribution ranking.

    Not implemented yet: accepts the same ``config`` and ``ctx`` arguments
    as the other pipeline steps, does nothing and returns None.
    """
def rank_company_top_n(config, ctx):
    """Placeholder for the company contribution ranking.

    Not implemented yet: accepts the same ``config`` and ``ctx`` arguments
    as the other pipeline steps, does nothing and returns None.
    """
def main(config):
    """Run the repository ranking pipeline.

    Each step receives ``config`` and a shared ``ctx`` dict; loaders store
    their DataFrames in ``ctx`` and ``rank_repo_top_n`` consumes them. A
    progress marker of the form ``@<step name>..`` is printed before each
    step.

    The user-info and commit-rank loaders and the personal and company
    rankings are intentionally not part of the pipeline at the moment.
    """
    ctx = {}
    steps = [
        load_repo_github_info,
        load_repo_csdn_trends,
        load_repo_github_active_trends,
        load_repo_github_popular_trends,
        rank_repo_top_n,
    ]
    for step in steps:
        print("@" + step.__name__ + "..")
        step(config, ctx)
# Run the pipeline with the module-level configuration when executed as a script.
if __name__ == "__main__":
    main(config)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册