提交 41053a6f 编写于 作者: F feilong

init repo rank calc

上级 1ca50867
无法预览此类型文件
因为它太大了，无法显示 source diff。你可以改为查看 blob。
import os import os
import numpy as np
import pandas as pd import pandas as pd
config = { config = {
"ranks": { "ranks": {
"personal": { "personal": {
"top_n": "../rank/个人向主要开源技术栈贡献榜单.csv", "top_n": "../ranks/个人向主要开源技术栈贡献榜单.csv",
"top_n_en": "../rank/个人向国际主要开源技术栈贡献榜单.csv", "top_n_en": "../ranks/个人向国际主要开源技术栈贡献榜单.csv",
"top_n_zh_cn": "../rank/个人向中国主要开源技术栈贡献榜单.csv", "top_n_zh_cn": "../ranks/个人向中国主要开源技术栈贡献榜单.csv",
}, },
"company": { "company": {
"top_n": "../rank/公司向主要开源技术栈贡献榜单.csv", "top_n": "../ranks/公司向主要开源技术栈贡献榜单.csv",
"top_n_en": "../rank/公司向国际主要开源技术栈贡献榜单.csv", "top_n_en": "../ranks/公司向国际主要开源技术栈贡献榜单.csv",
"top_n_zh_cn": "../rank/公司向中国主要开源技术栈贡献榜单.csv", "top_n_zh_cn": "../ranks/公司向中国主要开源技术栈贡献榜单.csv",
}, },
"repo": { "repo": {
"top_n": "../rank/开源项目榜.csv", "top_n": "../ranks/开源项目榜.csv",
"top_n_en": "../rank/开源项目榜_非中国项目.csv", "top_n_en": "../ranks/开源项目榜_非中国项目.csv",
"top_n_zh_cn": "../rank/开源项目榜_中国项目.csv" "top_n_zh_cn": "../ranks/开源项目榜_中国项目.csv"
} }
}, },
"schema": { "schema": {
...@@ -176,6 +177,32 @@ config = { ...@@ -176,6 +177,32 @@ config = {
} }
] ]
}, },
"repo_commit_rank": {
"file": "../CSDN/repo-commit-rank.csv",
"desc": "开源项目在Github的月commit变化",
"fields": [
{
"field_name": "actor_email",
"field_type": "str",
"desc": "用户邮箱"
},
{
"field_name": "sum_total",
"field_type": "int",
"desc": "用户累计Github项目贡献数"
},
{
"feild_name": "any_repo_path",
"field_type": "str",
"desc": "用户贡献过的任意一个Github仓库路径"
},
{
"field_name": "any_commit_id",
"field_type": "str",
"desc": "用户在上述贡献过的Github仓库里的任意一个commit"
}
]
},
"repo_github_active_trends": { "repo_github_active_trends": {
"file": "../PingCAP/项目活跃度变化.csv", "file": "../PingCAP/项目活跃度变化.csv",
"desc": "开源项目在Github上的月活跃度数据", "desc": "开源项目在Github上的月活跃度数据",
...@@ -257,7 +284,7 @@ def load_repo_github_user_info(config, ctx): ...@@ -257,7 +284,7 @@ def load_repo_github_user_info(config, ctx):
df = pd.read_excel( df = pd.read_excel(
schema["repo_github_user_info"]["file"], schema["repo_github_user_info"]["file"],
sheet_name=schema["repo_github_user_info"]["sheet_name"]) sheet_name=schema["repo_github_user_info"]["sheet_name"])
print(df.head()) df.fillna(value=0, inplace=True)
ctx["repo_github_user_info"] = df ctx["repo_github_user_info"] = df
...@@ -266,7 +293,7 @@ def load_repo_github_info(config, ctx): ...@@ -266,7 +293,7 @@ def load_repo_github_info(config, ctx):
df = pd.read_excel( df = pd.read_excel(
schema["repo_github_info"]["file"], schema["repo_github_info"]["file"],
sheet_name=schema["repo_github_info"]["sheet_name"]) sheet_name=schema["repo_github_info"]["sheet_name"])
print(df.head()) df.fillna(value=0, inplace=True)
ctx["repo_github_info"] = df ctx["repo_github_info"] = df
...@@ -274,50 +301,154 @@ def load_repo_csdn_trends(config, ctx): ...@@ -274,50 +301,154 @@ def load_repo_csdn_trends(config, ctx):
schema = config["schema"] schema = config["schema"]
df = pd.read_excel( df = pd.read_excel(
schema["repo_csdn_trends"]["file"], schema["repo_csdn_trends"]["file"],
sheet_name=schema["repo_github_info"]["sheet_name"]) sheet_name=schema["repo_csdn_trends"]["sheet_name"])
print(df.head()) df.fillna(value=0, inplace=True)
ctx["repo_csdn_trends"] = df ctx["repo_csdn_trends"] = df
def load_repo_commit_rank(config, ctx):
    """Load the commit-rank CSV into ``ctx["repo_commit_rank"]``.

    The file path is read from
    ``config["schema"]["repo_commit_rank"]["file"]``. Missing cells are
    replaced with 0 before the frame is stored.

    Args:
        config: Configuration mapping that contains the ``schema`` section.
        ctx: Mutable mapping that receives the loaded frame (updated in place).
    """
    csv_path = config["schema"]["repo_commit_rank"]["file"]
    # fillna() returns a new frame here, so nothing is mutated in place.
    ctx["repo_commit_rank"] = pd.read_csv(csv_path).fillna(value=0)
def load_repo_github_active_trends(config, ctx):
    """Load the GitHub activity-trend CSV into ``ctx["repo_github_active_trends"]``.

    The file path is read from
    ``config["schema"]["repo_github_active_trends"]["file"]``. Missing cells
    are replaced with 0 before the frame is stored.

    Args:
        config: Configuration mapping that contains the ``schema`` section.
        ctx: Mutable mapping that receives the loaded frame (updated in place).
    """
    csv_path = config["schema"]["repo_github_active_trends"]["file"]
    # fillna() returns a new frame here, so nothing is mutated in place.
    ctx["repo_github_active_trends"] = pd.read_csv(csv_path).fillna(value=0)
def load_repo_github_popular_trends(config, ctx):
    """Load the GitHub popularity-trend CSV into ``ctx["repo_github_popular_trends"]``.

    The file path is read from
    ``config["schema"]["repo_github_popular_trends"]["file"]``. Missing cells
    are replaced with 0 before the frame is stored.

    Args:
        config: Configuration mapping that contains the ``schema`` section.
        ctx: Mutable mapping that receives the loaded frame (updated in place).
    """
    csv_path = config["schema"]["repo_github_popular_trends"]["file"]
    # fillna() returns a new frame here, so nothing is mutated in place.
    ctx["repo_github_popular_trends"] = pd.read_csv(csv_path).fillna(value=0)
def _merge_monthly_means(repo_dict, trends, column_map):
    """Copy per-repository column means from a trends table onto ``repo_dict``.

    Rows of ``trends`` are grouped by ``repo_name`` and only the columns named
    in ``column_map`` (source column -> target key) are averaged. Repositories
    without an entry in ``repo_dict`` (matched on the lower-cased name) are
    skipped.
    """
    means = trends.groupby("repo_name")[list(column_map)].mean()
    for repo_name, row in means.iterrows():
        repo_item = repo_dict.get(str(repo_name).lower())
        if repo_item is None:
            continue
        for source, target in column_map.items():
            repo_item[target] = row[source]


def rank_repo_top_n(config, ctx):
    """Score every repository and write the ranking CSV.

    Reads the frames ``repo_github_info``, ``repo_csdn_trends``,
    ``repo_github_active_trends`` and ``repo_github_popular_trends`` from
    ``ctx``, merges them per repository (keyed by the lower-cased repository
    name), computes a weighted score, sorts by it in descending order and
    writes the table to ``config["ranks"]["repo"]["top_n"]``.

    Trend rows that refer to a repository missing from ``repo_github_info``
    are ignored, and metrics a repository has no data for count as 0.

    Args:
        config: Configuration mapping; only the output path is read here.
        ctx: Mapping holding the four input frames listed above.
    """
    # Relative weights of each metric; normalised below so they sum to 1.
    weights = {
        "star": 1,
        "fork": 1,
        "contributors": 1,
        "csdn_index_month_avg": 1,
        "push_count_month_avg": 1,
        "pr_count_month_avg": 1,
        "issue_count_month_avg": 1,
        "creator_count_month_avg": 1,
        "watch_count_month_avg": 1,
        "fork_count_month_avg": 1
    }
    total_weight_value = sum(weights.values())
    weights = {key: value / total_weight_value for key, value in weights.items()}

    # Base data per repository; primary key is FullName.
    repo_dict = {}
    info = ctx["repo_github_info"].fillna(value=0)
    for _, row in info.iterrows():
        repo_dict[row["FullName"].lower()] = {
            "region": row["Region"],
            "star": int(row["Star"]),
            "fork": int(row["Fork"]),
            "contributors": int(row["Contributors"]),
        }

    # CSDN index data; primary key is repo_name, every other column is
    # averaged into one value.
    for _, row in ctx["repo_csdn_trends"].iterrows():
        repo_item = repo_dict.get(str(row["repo_name"]).lower())
        if repo_item is None:
            continue
        # Select by label so the result does not depend on column position.
        monthly_values = pd.to_numeric(row.drop(labels="repo_name"))
        repo_item["csdn_index_month_avg"] = monthly_values.mean()

    # GitHub activity trends, averaged per repository.
    _merge_monthly_means(
        repo_dict,
        ctx["repo_github_active_trends"].fillna(value=0),
        {
            "push_count": "push_count_month_avg",
            "pr_count": "pr_count_month_avg",
            "issue_count": "issue_count_month_avg",
            "creator_count": "creator_count_month_avg",
        },
    )

    # GitHub popularity trends, averaged per repository.
    _merge_monthly_means(
        repo_dict,
        ctx["repo_github_popular_trends"],
        {
            "watch_count": "watch_count_month_avg",
            "fork_count": "fork_count_month_avg",
        },
    )

    # Merged table: one row per repository, indexed by the lower-cased name.
    df = pd.DataFrame.from_dict(repo_dict, orient='index')
    for key in weights:
        # A metric nobody has data for would otherwise be a missing column.
        if key not in df.columns:
            df[key] = 0
    df = df.fillna(value=0)

    # Ranking score: weighted sum of all metrics.
    df["score"] = 0.0
    for key, weight in weights.items():
        df["score"] += df[key] * weight
    print(df.head())

    df = df.sort_values(by="score", ascending=False)
    df.to_csv(config["ranks"]["repo"]["top_n"])
def rank_personal_top_n(config, ctx):
    """Placeholder for the personal contribution ranking.

    Not implemented yet: the function accepts the shared ``config`` and
    ``ctx`` arguments, does nothing and returns ``None``.
    """
def rank_company_top_n(config, ctx):
    """Placeholder for the company contribution ranking.

    Not implemented yet: the function accepts the shared ``config`` and
    ``ctx`` arguments, does nothing and returns ``None``.
    """
def main(config):
    """Run the repository ranking pipeline.

    Creates an empty context dict, runs each enabled step with
    ``(config, ctx)`` and prints ``@<step name>..`` before each one as a
    progress marker.

    Args:
        config: Configuration mapping passed unchanged to every step.
    """
    ctx = {}

    # Currently disabled steps (not run): load_repo_github_user_info,
    # load_repo_commit_rank, rank_personal_top_n, rank_company_top_n.
    steps = (
        load_repo_github_info,
        load_repo_csdn_trends,
        load_repo_github_active_trends,
        load_repo_github_popular_trends,
        rank_repo_top_n,
    )
    for step in steps:
        print("@{}..".format(step.__name__))
        step(config, ctx)
if __name__ == "__main__":
    # Run the pipeline with the module-level configuration when the file is
    # executed as a script.
    main(config)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册