init repo rank calc

41053a6f · feilong · 1ca50867 · 41053a6f · 41053a6f · 41053a6f
隐藏空白更改
内联并排

Showing with 154 additions and 23 deletions

GitHub-Repos.xlsx GitHub-Repos.xlsx +0 -0

ranks/开源项目榜.csv ranks/开源项目榜.csv +0 -0

src/main.py src/main.py +154 -23

未找到文件。
--- a/GitHub-Repos.xlsx
+++ b/GitHub-Repos.xlsx
--- a/ranks/开源项目榜.csv
+++ b/ranks/开源项目榜.csv
--- a/src/main.py
+++ b/src/main.py
 import os
+import numpy as np
 import pandas as pd
 config = {
    "ranks": {
        "personal": {
-            "top_n": "../rank/个人向主要开源技术栈贡献榜单.csv",
+            "top_n": "../ranks/个人向主要开源技术栈贡献榜单.csv",
-            "top_n_en": "../rank/个人向国际主要开源技术栈贡献榜单.csv",
+            "top_n_en": "../ranks/个人向国际主要开源技术栈贡献榜单.csv",
-            "top_n_zh_cn": "../rank/个人向中国主要开源技术栈贡献榜单.csv",
+            "top_n_zh_cn": "../ranks/个人向中国主要开源技术栈贡献榜单.csv",
        },
        "company": {
-            "top_n": "../rank/公司向主要开源技术栈贡献榜单.csv",
+            "top_n": "../ranks/公司向主要开源技术栈贡献榜单.csv",
-            "top_n_en": "../rank/公司向国际主要开源技术栈贡献榜单.csv",
+            "top_n_en": "../ranks/公司向国际主要开源技术栈贡献榜单.csv",
-            "top_n_zh_cn": "../rank/公司向中国主要开源技术栈贡献榜单.csv",
+            "top_n_zh_cn": "../ranks/公司向中国主要开源技术栈贡献榜单.csv",
        },
        "repo": {
-            "top_n": "../rank/开源项目榜.csv",
+            "top_n": "../ranks/开源项目榜.csv",
-            "top_n_en": "../rank/开源项目榜_非中国项目.csv",
+            "top_n_en": "../ranks/开源项目榜_非中国项目.csv",
-            "top_n_zh_cn": "../rank/开源项目榜_中国项目.csv"
+            "top_n_zh_cn": "../ranks/开源项目榜_中国项目.csv"
        }
    },
    "schema": {
@@ -176,6 +177,32 @@ config = {
                }
            ]
        },
+        # Schema of the commit-rank CSV read by load_repo_commit_rank().
+        # NOTE(review): "desc" talks about per-project monthly commit change
+        # while the fields below describe a user (email, contribution total);
+        # confirm which one matches the actual CSV.
+        "repo_commit_rank": {
+            "file": "../CSDN/repo-commit-rank.csv",
+            "desc": "开源项目在Github的月commit变化",
+            "fields": [
+                {
+                    "field_name": "actor_email",
+                    "field_type": "str",
+                    "desc": "用户邮箱"
+                },
+                {
+                    "field_name": "sum_total",
+                    "field_type": "int",
+                    "desc": "用户累计Github项目贡献数"
+                },
+                {
+                    # Key was misspelled "feild_name"; every other entry
+                    # uses "field_name".
+                    "field_name": "any_repo_path",
+                    "field_type": "str",
+                    "desc": "用户贡献过的任意一个Github仓库路径"
+                },
+                {
+                    "field_name": "any_commit_id",
+                    "field_type": "str",
+                    "desc": "用户在上述贡献过的Github仓库里的任意一个commit"
+                }
+            ]
+        },
        "repo_github_active_trends": {
            "file": "../PingCAP/项目活跃度变化.csv",
            "desc": "开源项目在Github上的月活跃度数据",
@@ -257,7 +284,7 @@ def load_repo_github_user_info(config, ctx):
    df = pd.read_excel(
        schema["repo_github_user_info"]["file"],
        sheet_name=schema["repo_github_user_info"]["sheet_name"])
-    print(df.head())
+    df.fillna(value=0, inplace=True)
    ctx["repo_github_user_info"] = df
@@ -266,7 +293,7 @@ def load_repo_github_info(config, ctx):
    df = pd.read_excel(
        schema["repo_github_info"]["file"],
        sheet_name=schema["repo_github_info"]["sheet_name"])
-    print(df.head())
+    df.fillna(value=0, inplace=True)
    ctx["repo_github_info"] = df
@@ -274,50 +301,154 @@ def load_repo_csdn_trends(config, ctx):
    schema = config["schema"]
    df = pd.read_excel(
        schema["repo_csdn_trends"]["file"],
-        sheet_name=schema["repo_github_info"]["sheet_name"])
+        sheet_name=schema["repo_csdn_trends"]["sheet_name"])
-    print(df.head())
+    df.fillna(value=0, inplace=True)
    ctx["repo_csdn_trends"] = df
 def load_repo_commit_rank(config, ctx):
    schema = config["schema"]
    df = pd.read_csv(schema["repo_commit_rank"]["file"])
-    print(df.columns)
+    df.fillna(value=0, inplace=True)
    ctx["repo_commit_rank"] = df
 def load_repo_github_active_trends(config, ctx):
    schema = config["schema"]
    df = pd.read_csv(schema["repo_github_active_trends"]["file"])
-    print(df.columns)
+    df.fillna(value=0, inplace=True)
    ctx["repo_github_active_trends"] = df
-def rank_personal_top_n(config, ctx):
+def load_repo_github_popular_trends(config, ctx):
-    pass
+    schema = config["schema"]
+    df = pd.read_csv(schema["repo_github_popular_trends"]["file"])
+    df.fillna(value=0, inplace=True)
+    ctx["repo_github_popular_trends"] = df
-def rank_company_top_n(config, ctx):
+def rank_repo_top_n(config, ctx):
+    """Score every repository and write the ranking CSV.
+
+    Merges, per repository (keyed by lower-cased full name):
+      * ctx["repo_github_info"]: Region, Star, Fork, Contributors
+      * ctx["repo_csdn_trends"]: mean of all columns after the first
+      * ctx["repo_github_active_trends"]: per-repo mean of push/pr/issue/
+        creator counts
+      * ctx["repo_github_popular_trends"]: per-repo mean of watch/fork counts
+
+    The score is the weighted sum of those metrics, with the weights
+    normalised to sum to 1. The table is sorted by score (descending) and
+    written to config["ranks"]["repo"]["top_n"]. Returns None.
+
+    Repositories that appear only in a trend table (not in
+    repo_github_info) are ignored; metrics missing for a repository
+    count as 0.
+    """
+    repo_dict = {}
+    # Base metrics from repo_github_info, primary key: FullName.
+    # fillna returns a new frame, so the result has to be kept.
+    info = ctx["repo_github_info"].fillna(value=0)
+    for _, row in info.iterrows():
+        repo_dict[str(row["FullName"]).lower()] = {
+            "region": row["Region"],
+            "star": int(row["Star"]),
+            "fork": int(row["Fork"]),
+            "contributors": int(row["Contributors"]),
+        }
+    # CSDN index from repo_csdn_trends, primary key: repo_name.
+    # Assumes repo_name is the first column and every following column is
+    # one month's index value -- TODO confirm against the sheet layout.
+    for _, row in ctx["repo_csdn_trends"].iterrows():
+        repo_item = repo_dict.get(str(row["repo_name"]).lower())
+        if repo_item is None:
+            # Repo has CSDN data but no GitHub info row: nothing to attach to.
+            continue
+        repo_item["csdn_index_month_avg"] = row.iloc[1:].mean()
+    # Monthly trend tables, primary key: repo_name. Each listed column is
+    # averaged per repository and stored as "<column>_month_avg".
+    trend_sources = (
+        ("repo_github_active_trends",
+         ("push_count", "pr_count", "issue_count", "creator_count")),
+        ("repo_github_popular_trends",
+         ("watch_count", "fork_count")),
+    )
+    for ctx_key, columns in trend_sources:
+        trends = ctx[ctx_key].fillna(value=0)
+        # Average only the columns that are used, so non-numeric columns
+        # (e.g. a month label) cannot break the aggregation.
+        monthly_avg = trends.groupby("repo_name")[list(columns)].mean()
+        for repo_name, row in monthly_avg.iterrows():
+            repo_item = repo_dict.get(str(repo_name).lower())
+            if repo_item is None:
+                continue
+            for column in columns:
+                repo_item[column + "_month_avg"] = row[column]
+    # One row per repository, indexed by the lower-cased repo key.
+    df = pd.DataFrame.from_dict(repo_dict, orient='index')
+    # Relative weight of each metric in the final score.
+    weights = {
+        "star": 1,
+        "fork": 1,
+        "contributors": 1,
+        "csdn_index_month_avg": 1,
+        "push_count_month_avg": 1,
+        "pr_count_month_avg": 1,
+        "issue_count_month_avg": 1,
+        "creator_count_month_avg": 1,
+        "watch_count_month_avg": 1,
+        "fork_count_month_avg": 1
+    }
+    # A metric that no repository received has no column yet.
+    for key in weights:
+        if key not in df.columns:
+            df[key] = 0
+    df.fillna(value=0, inplace=True)
+    total_weight_value = sum(weights.values())
+    df["score"] = sum(
+        df[key] * (weight / total_weight_value)
+        for key, weight in weights.items()
+    )
+    print(df.head())
+    df = df.sort_values(by="score", ascending=False)
+    df.to_csv(config["ranks"]["repo"]["top_n"])
+def rank_personal_top_n(config, ctx):
    pass
-def rank_repo_top_n(config, ctx):
+def rank_company_top_n(config, ctx):
    pass
 def main(config):
    ctx = {}
-    load_repo_github_user_info(config, ctx)
+    # print("@load_repo_github_user_info..")
+    # load_repo_github_user_info(config, ctx)
+    # print("@load_repo_commit_rank..")
+    # load_repo_commit_rank(config, ctx)
+    print("@load_repo_github_info..")
    load_repo_github_info(config, ctx)
+    print("@load_repo_csdn_trends..")
    load_repo_csdn_trends(config, ctx)
-    load_repo_commit_rank(config, ctx)
+    print("@load_repo_github_active_trends..")
    load_repo_github_active_trends(config, ctx)
-    rank_personal_top_n(config, ctx)
+    print("@load_repo_github_popular_trends..")
-    rank_company_top_n(config, ctx)
+    load_repo_github_popular_trends(config, ctx)
+    print("@rank_repo_top_n..")
    rank_repo_top_n(config, ctx)
+    # print("@rank_personal_top_n..")
+    # rank_personal_top_n(config, ctx)
+    # print("@rank_company_top_n..")
+    # rank_company_top_n(config, ctx)
 if __name__ == "__main__":
    main(config)