Fix repo country classification and add per-repo contributor distribution output

21b2ed0e · feilong · 067be799 · 21b2ed0e · 21b2ed0e · 21b2ed0e
9 changed files
--- a/ranks/个人向中国主要开源技术栈贡献榜单.csv
+++ b/ranks/个人向中国主要开源技术栈贡献榜单.csv
--- a/ranks/个人向国际主要开源技术栈贡献榜单.csv
+++ b/ranks/个人向国际主要开源技术栈贡献榜单.csv
--- a/ranks/公司向中国主要开源技术栈贡献榜单.csv
+++ b/ranks/公司向中国主要开源技术栈贡献榜单.csv
--- a/ranks/开源项目_用户组成.csv
+++ b/ranks/开源项目_用户组成.csv
--- a/ranks/开源项目榜_中国项目.csv
+++ b/ranks/开源项目榜_中国项目.csv
--- a/ranks/开源项目榜_国际项目.csv
+++ b/ranks/开源项目榜_国际项目.csv
--- a/ranks/开源项目榜_非中国项目.csv
+++ b/ranks/开源项目榜_非中国项目.csv
--- a/src/config.py
+++ b/src/config.py
@@ -21,8 +21,9 @@ config = {
        },
        "repo": {
            "top_n": "../ranks/开源项目榜.csv",
-            "top_n_en": "../ranks/开源项目榜_非中国项目.csv",
+            "top_n_en": "../ranks/开源项目榜_国际项目.csv",
-            "top_n_zh_cn": "../ranks/开源项目榜_中国项目.csv"
+            "top_n_zh_cn": "../ranks/开源项目榜_中国项目.csv",
+            "top_n_user_distribute": "../ranks/开源项目_用户组成.csv",
        }
    },
    "schema": {

--- a/src/tasks/ranks.py
+++ b/src/tasks/ranks.py
@@ -264,36 +264,172 @@ def rank_personal_top_n(config, ctx):
    print(f"exit_person_count:{exit_person_count}")
    # 项目内的用户贡献排序
+    repo_china_user_avg_factor = 0
+    repo_china_user_avg_factor_c = 0
+    repo_china_user_commit_avg_factor = 0
+    repo_china_user_commit_avg_factor_c = 0
    for repo_key in repo_top_n_dict:
        repo_item = repo_top_n_dict[repo_key]
        repo_user_contribute_list = repo_item.get('user_contribute_list')
+        if repo_item['region']!="国产":
+            continue
        if repo_user_contribute_list is not None:
            repo_user_contribute_list.sort(
                key=lambda x: x["total"], reverse=True)
-            # 根据项目贡献者的国别信息统计，来决定项目的国别
+            # 统计项目内用户分布
-            s = {
+            users_stat = {
+                "china":0,
+                "international": 0,
+                "Null": 0,
+                "all": len(repo_user_contribute_list)
+            }
+            for u in repo_user_contribute_list:
+                users_stat[u["country"]]+=1
+            # 统计项目内用户贡献分布
+            commits_stat = {
+                "china":0,
+                "international": 0,
+                "Null": 0,
+            }
+            for u in repo_user_contribute_list:
+                commits_stat[u["country"]]+=u["total"]
+            commits_stat["all"] = commits_stat["china"] + commits_stat["Null"] + commits_stat["international"]
+            repo_china_user_avg_factor += users_stat["china"]/users_stat["all"]
+            repo_china_user_avg_factor_c += 1
+            repo_china_user_commit_avg_factor += commits_stat["china"]/commits_stat["all"]
+            repo_china_user_commit_avg_factor_c +=1
+    repo_china_user_avg_factor = repo_china_user_avg_factor/repo_china_user_avg_factor_c
+    repo_china_user_commit_avg_factor = repo_china_user_commit_avg_factor/repo_china_user_commit_avg_factor_c
+    repo_user_distribute_dict = {}
+    for repo_key in repo_top_n_dict:
+        repo_item = repo_top_n_dict[repo_key]
+        repo_user_contribute_list = repo_item.get('user_contribute_list')
+        if repo_user_contribute_list is not None:
+            repo_user_contribute_list.sort(
+                key=lambda x: x["total"], reverse=True)
+            # 统计项目内用户分布
+            users_stat = {
                "china":0,
                "international": 0,
                "Null": 0,
                "all": len(repo_user_contribute_list)
            }
            for u in repo_user_contribute_list:
-                s[u["country"]]+=1
+                users_stat[u["country"]]+=1
+            # 统计项目内用户贡献分布
+            commits_stat = {
+                "china":0,
+                "international": 0,
+                "Null": 0,
+            }
+            for u in repo_user_contribute_list:
+                commits_stat[u["country"]]+=u["total"]
+            commits_stat["all"] = commits_stat["china"] + commits_stat["Null"] + commits_stat["international"]
+            # 根据项目贡献者的国别信息统计，来决定项目的国别
            # 如果项目内的成员，在国别上占主导优势（超过50%）
            # 就判定该项目为对应的国别
-            if s["Null"]/s["all"]>0.5:
+            user_factor = users_stat["china"]/users_stat["all"]
+            commit_factor =commits_stat["china"]/ commits_stat["all"]
+            # ----------------------------------------------
+            #
+            # 计算项目是否是国产，请按需调参数
+            #
+            # ----------------------------------------------
+            # 是否采用标注国产的信息
+            use_label_as_china = True
+            if users_stat["Null"]/users_stat["all"]>0.5:
+                # 如果标注为国产，再看下中国用户数做进一步分类
+                if repo_item["region"]=="国产":
+                    if use_label_as_china:
+                        repo_item["country"] = "china"
+                    else:
+                        repo_item["country"] = "Null"
+                else:
                    repo_item["country"] = "Null"
            else:
-                if (s["china"]+s["Null"])/s["all"]>0.5:
+                # 为了不闹国际笑话，我们还是判定项目是中国项目的时候，严格一点
+                # 中国贡献者占比 或者 中国用户commit数超过 一定比例
+                if user_factor>0.4 or commit_factor>0.4:
                    repo_item["country"] = "china"
+                else:
+                    if repo_item["region"]=="国产":
+                        if use_label_as_china:
+                            repo_item["country"] = "china"
+                        else:
+                            repo_item["country"] = "international"
                    else:
                        repo_item["country"] = "international"
+            # ----------------------------------------------
+            repo_user_distribute_dict[repo_key] = {
+                "基于用户百分比计算国别": repo_item["country"],
+                "全部开发者": users_stat["all"],
+                "全部开发者_commits": commits_stat["all"],
+                "未知开发者": users_stat["Null"],
+                "未知开发者_commits": commits_stat["Null"],
+                "中国开发者": users_stat["china"],
+                "中国开发者_commits": commits_stat["china"],
+                "国际开发者": users_stat["international"],
+                "国际开发者_commits": commits_stat["international"],
+            }
        else:
+            repo_user_distribute_dict[repo_key] = {
+                "基于用户百分比计算国别": "未知",
+                "全部开发者": "未知",
+                "全部开发者_commits": "未知",
+                "未知开发者": "未知",
+                "未知开发者_commits": "未知",
+                "中国开发者": "未知",
+                "中国开发者_commits": "未知",
+                "国际开发者": "未知",
+                "国际开发者_commits": "未知",
+            }
            # print(f"[warn] missing contributors repo:{repo_key}")
            pass
+    # 输出开源项目——用户组成分析
+    repo_user_distribute_df = pd.DataFrame.from_dict(repo_user_distribute_dict, orient='index')
+    repo_user_distribute_df.to_csv(config["ranks"]["repo"]["top_n_user_distribute"])
+    ctx["ranks"]["top_n_repo_user_distribute"] = repo_user_distribute_df
+    # 输出开源项目-中国/国际榜
+    repo_top_n = pd.DataFrame.from_dict(repo_top_n_dict, orient='index')
+    repo_top_n.reset_index()
+    repo_top_n_zh_cn = repo_top_n[repo_top_n["country"]=="china"]
+    repo_top_n_zh_cn.sort_values(by="score", ascending=False, inplace=True)
+    repo_top_n_zh_cn.to_csv(config["ranks"]["repo"]["top_n_zh_cn"])
+    ctx["ranks"]["repo_top_n_zh_cn"] = repo_top_n_zh_cn
+    repo_top_n_en = repo_top_n[repo_top_n["country"]=="international"]
+    repo_top_n_en.sort_values(by="score", ascending=False, inplace=True)
+    repo_top_n_en.to_csv(config["ranks"]["repo"]["top_n_en"])
+    ctx["ranks"]["repo_top_n_en"] = repo_top_n_zh_cn
    # 现在，合并每个项目的用户贡献到一个总表
    personal_score_dict = {}