add personal rank

89cef470 · feilong · 656cf5f7 · 89cef470 · 89cef470 · 89cef470
4 changed files
--- a/ranks/个人向主要开源技术栈贡献榜单.csv
+++ b/ranks/个人向主要开源技术栈贡献榜单.csv
--- a/ranks/开源项目榜.csv
+++ b/ranks/开源项目榜.csv
--- a/src/config.py
+++ b/src/config.py
@@ -103,6 +103,32 @@ config = {
                }
            ]
        },
+        "repo_github_user_commit_info": {
+            "file": "../data/CSDN/commit_analysis.csv",
+            "desc": "开发者在开源项目的贡献数信息",
+            "fields": [
+                {
+                    "field_name": "repo_name",
+                    "field_type": "str",
+                    "desc": "开源项目仓库名字"
+                },
+                {
+                    "field_name": "any_commit_id",
+                    "field_type": "str",
+                    "desc": "开发者在对应开源项目上贡献的任意commit id"
+                },
+                {
+                    "field_name": "actor_email",
+                    "field_type": "str",
+                    "desc": "用户邮箱"
+                },
+                {
+                    "field_name": "total",
+                    "field_type": "int",
+                    "desc": "开发者累计Github项目贡献数"
+                },
+            ]
+        },
        "repo_github_info": {
            "file": "../data/Github-Repos.xlsx",
            "sheet_name": "汇总",

--- a/src/tasks/ranks.py
+++ b/src/tasks/ranks.py
 # -*- coding: utf-8 -*-
+from audioop import reverse
 import os
 import numpy as np
 import pandas as pd
-from config import config
-
-
-def load_repo_github_user_info(config, ctx):
-    schema = config["schema"]
-    df = pd.read_excel(
-        schema["repo_github_user_info"]["file"],
-        sheet_name=schema["repo_github_user_info"]["sheet_name"])
-    df.fillna(value=0, inplace=True)
-    ctx["repo_github_user_info"] = df


 def load_repo_github_info(config, ctx):
@@ -53,6 +44,33 @@ def load_repo_github_popular_trends(config, ctx):
    ctx["repo_github_popular_trends"] = df


+def load_repo_github_user_info(config, ctx):
+    """Load the GitHub user sheet into ``ctx["repo_github_user_info"]``.
+
+    Args:
+        config: Settings dict; ``config["schema"]["repo_github_user_info"]``
+            supplies the workbook ``file`` path and its ``sheet_name``.
+        ctx: Shared context dict that receives the loaded DataFrame.
+
+    Empty cells are replaced with 0 in place before the frame is stored.
+    """
+    table_schema = config["schema"]["repo_github_user_info"]
+    user_info = pd.read_excel(
+        table_schema["file"],
+        sheet_name=table_schema["sheet_name"],
+    )
+    user_info.fillna(value=0, inplace=True)
+    ctx["repo_github_user_info"] = user_info
+
+
+def load_repo_github_user_commit_info(config, ctx):
+    """Load the per-repo commit CSV into ctx, replacing empty cells with 0."""
+    csv_path = config["schema"]["repo_github_user_commit_info"]["file"]
+    commit_info = pd.read_csv(csv_path)
+    commit_info.fillna(value=0, inplace=True)
+    ctx["repo_github_user_commit_info"] = commit_info
+
+
 def rank_repo_top_n(config, ctx):
    repo_rank = []
    repo_dict = {}
@@ -136,24 +154,131 @@ def rank_repo_top_n(config, ctx):
    df = df.sort_values(by="score", ascending=False)
    df.to_csv(config["ranks"]["repo"]["top_n"])

+    for index, row in df.iterrows():
+        repo_name = index
+        repo_item = repo_dict[repo_name]
+        repo_item["score"] = row["score"]
+
+    ctx["ranks"]["repo_top_n"] = df
+    ctx["ranks"]["repo_top_n_dict"] = repo_dict
+

 def rank_personal_top_n(config, ctx):
-    pass
+    """Rank contributors by commit counts weighted with repo scores.
+
+    Every row of ``ctx["repo_github_user_commit_info"]`` whose repo is found in
+    ``ctx["ranks"]["repo_top_n_dict"]`` (filled by ``rank_repo_top_n``) is
+    appended to that repo entry under ``"user_contribute_list"``, and each
+    list is sorted by ``total`` in descending order, so the repo dict is
+    mutated. A contributor's score is the sum of ``total * repo score`` over
+    those rows. Profile columns are copied from
+    ``ctx["repo_github_user_info"]`` when the lower-cased ``actor_email``
+    matches; contributors without a profile row keep only their score.
+
+    Rows whose ``actor_email`` or ``repo_name`` is not a string are skipped;
+    the loaders replace empty cells with 0, so such rows carry no usable key.
+
+    Args:
+        config: Settings dict; ``config["ranks"]["personal"]["top_n"]`` is the
+            CSV path the ranking is written to.
+        ctx: Shared context dict holding the loaded tables and repo ranking.
+
+    Returns:
+        None. The ranking is written as CSV (indexed by the lower-cased
+        e-mail) and its first rows are printed.
+    """
+    repo_top_n_dict = ctx["ranks"]["repo_top_n_dict"]
+    profile_fields = (
+        "avatar_url",
+        "name",
+        "company",
+        "location",
+        "followers",
+        "author_id",
+        "type",
+        "login",
+    )

+    # Index repo_github_user_info by lower-cased actor_email.
+    personal_dict = {}
+    for _, row in ctx["repo_github_user_info"].iterrows():
+        email = row["actor_email"]

-def rank_company_top_n(config, ctx):
-    pass
+        # Skip malformed rows: empty cells were filled with 0 on load.
+        if not isinstance(email, str):
+            continue

+        person = {field: row[field] for field in profile_fields}
+        person["all_repo_contribute_total"] = int(row["sum_total"])
+        personal_dict[email.lower()] = person
+
+    # Attach each repo_github_user_commit_info row to its ranked repo.
+    for _, row in ctx["repo_github_user_commit_info"].iterrows():
+        email = row["actor_email"]
+        repo_name = row["repo_name"]
+
+        # Skip malformed rows; a non-string repo name would break .lower().
+        if not isinstance(email, str) or not isinstance(repo_name, str):
+            continue
+
+        repo_key = repo_name.lower()
+        repo_item = repo_top_n_dict.get(repo_key)
+        if repo_item is None:
+            # The repo is not part of the repo ranking.
+            continue
+
+        repo_item.setdefault("user_contribute_list", []).append({
+            "actor_email": email,
+            "total": row["total"],
+            "repo_score": repo_item["score"],  # score of the repo
+        })
+
+    # Sort the contributors of each repo by their commit count.
+    for repo_key, repo_item in repo_top_n_dict.items():
+        repo_user_contribute_list = repo_item.get("user_contribute_list")
+        if repo_user_contribute_list is None:
+            # No commit row referenced this repo.
+            print(f"[warn] missing contributors repo:{repo_key}")
+        else:
+            repo_user_contribute_list.sort(
+                key=lambda x: x["total"], reverse=True)
+
+    # Merge the per-repo contributions into one score per contributor.
+    personal_score_dict = {}
+    for repo_item in repo_top_n_dict.values():
+        for repo_person in repo_item.get("user_contribute_list") or []:
+            person_key = repo_person["actor_email"].lower()
+            contribution = repo_person["total"] * repo_person["repo_score"]
+
+            person_item = personal_score_dict.get(person_key)
+            if person_item is not None:
+                person_item["score"] += contribution
+                continue
+
+            person_item = {"score": contribution}
+            personal_score_dict[person_key] = person_item
+
+            # Contributors without a profile row keep only their score.
+            person_info = personal_dict.get(person_key)
+            if person_info is not None:
+                for field in profile_fields:
+                    person_item[field] = person_info[field]
+
+    # Ranking. sort_values returns a new frame, so the result must be kept;
+    # an empty table has no "score" column and is written out unsorted.
+    df = pd.DataFrame.from_dict(personal_score_dict, orient="index")
+    if not df.empty:
+        df = df.sort_values(by="score", ascending=False)
+    csv_path = config["ranks"]["personal"]["top_n"]
+    df.to_csv(csv_path)
+    print(df.head())

-def calc_ranks(config, options):
-    ctx = {}

-    # print("@load_repo_github_user_info..")
-    # load_repo_github_user_info(config, ctx)
+def rank_company_top_n(config, ctx):
+    pass  # placeholder: the company ranking is not implemented yet

-    # print("@load_repo_commit_rank..")
-    # load_repo_commit_rank(config, ctx)

+def calc_repo_rank(config, options, ctx):
    print("@load_repo_github_info..")
    load_repo_github_info(config, ctx)

@@ -169,8 +294,29 @@ def calc_ranks(config, options):
    print("@rank_repo_top_n..")
    rank_repo_top_n(config, ctx)

-    # print("@rank_personal_top_n..")
-    # rank_personal_top_n(config, ctx)

+def calc_personal_rank(config, options, ctx):
+    """Load both contributor tables into ``ctx``, then build the personal rank."""
+    for banner, step in (
+        ("@load_repo_github_user_info..", load_repo_github_user_info),
+        ("@load_repo_github_user_commit_info..", load_repo_github_user_commit_info),
+        ("@rank_personal_top_n..", rank_personal_top_n),
+    ):
+        print(banner)
+        step(config, ctx)
+
+
+def calc_company_rank(config, options, ctx):
    # print("@rank_company_top_n..")
    # rank_company_top_n(config, ctx)
+    pass  # placeholder: the company ranking step is still disabled
+
+
+def calc_ranks(config, options):
+    """Run the repo ranking, then the personal ranking, on one shared ctx."""
+    # rank_repo_top_n stores its results under ctx["ranks"] for later steps.
+    shared_ctx = {"ranks": {}}
+
+    # calc_company_rank is not run: it is still a placeholder.
+    for run_step in (calc_repo_rank, calc_personal_rank):
+        run_step(config, options, shared_ctx)