提交 21b2ed0e 编写于 作者: F feilong

fix repo country

上级 067be799
由于文件过大,无法显示源代码差异。你可以改为查看 blob。
由于文件过大,无法显示源代码差异。你可以改为查看 blob。
由于文件过大,无法显示源代码差异。你可以改为查看 blob。
由于文件过大,无法显示源代码差异。你可以改为查看 blob。
由于文件过大,无法显示源代码差异。你可以改为查看 blob。
由于文件过大,无法显示源代码差异。你可以改为查看 blob。
由于文件过大,无法显示源代码差异。你可以改为查看 blob。
...@@ -21,8 +21,9 @@ config = { ...@@ -21,8 +21,9 @@ config = {
}, },
"repo": { "repo": {
"top_n": "../ranks/开源项目榜.csv", "top_n": "../ranks/开源项目榜.csv",
"top_n_en": "../ranks/开源项目榜_非中国项目.csv", "top_n_en": "../ranks/开源项目榜_国际项目.csv",
"top_n_zh_cn": "../ranks/开源项目榜_中国项目.csv" "top_n_zh_cn": "../ranks/开源项目榜_中国项目.csv",
"top_n_user_distribute": "../ranks/开源项目_用户组成.csv",
} }
}, },
"schema": { "schema": {
......
...@@ -264,36 +264,172 @@ def rank_personal_top_n(config, ctx): ...@@ -264,36 +264,172 @@ def rank_personal_top_n(config, ctx):
print(f"exit_person_count:{exit_person_count}") print(f"exit_person_count:{exit_person_count}")
# 项目内的用户贡献排序 # 项目内的用户贡献排序
repo_china_user_avg_factor = 0
repo_china_user_avg_factor_c = 0
repo_china_user_commit_avg_factor = 0
repo_china_user_commit_avg_factor_c = 0
for repo_key in repo_top_n_dict: for repo_key in repo_top_n_dict:
repo_item = repo_top_n_dict[repo_key] repo_item = repo_top_n_dict[repo_key]
repo_user_contribute_list = repo_item.get('user_contribute_list') repo_user_contribute_list = repo_item.get('user_contribute_list')
if repo_item['region']!="国产":
continue
if repo_user_contribute_list is not None: if repo_user_contribute_list is not None:
repo_user_contribute_list.sort( repo_user_contribute_list.sort(
key=lambda x: x["total"], reverse=True) key=lambda x: x["total"], reverse=True)
# 根据项目贡献者的国别信息统计,来决定项目的国别 # 统计项目内用户分布
s = { users_stat = {
"china":0,
"international": 0,
"Null": 0,
"all": len(repo_user_contribute_list)
}
for u in repo_user_contribute_list:
users_stat[u["country"]]+=1
# 统计项目内用户贡献分布
commits_stat = {
"china":0,
"international": 0,
"Null": 0,
}
for u in repo_user_contribute_list:
commits_stat[u["country"]]+=u["total"]
commits_stat["all"] = commits_stat["china"] + commits_stat["Null"] + commits_stat["international"]
repo_china_user_avg_factor += users_stat["china"]/users_stat["all"]
repo_china_user_avg_factor_c += 1
repo_china_user_commit_avg_factor += commits_stat["china"]/commits_stat["all"]
repo_china_user_commit_avg_factor_c +=1
repo_china_user_avg_factor = repo_china_user_avg_factor/repo_china_user_avg_factor_c
repo_china_user_commit_avg_factor = repo_china_user_commit_avg_factor/repo_china_user_commit_avg_factor_c
repo_user_distribute_dict = {}
for repo_key in repo_top_n_dict:
repo_item = repo_top_n_dict[repo_key]
repo_user_contribute_list = repo_item.get('user_contribute_list')
if repo_user_contribute_list is not None:
repo_user_contribute_list.sort(
key=lambda x: x["total"], reverse=True)
# 统计项目内用户分布
users_stat = {
"china":0, "china":0,
"international": 0, "international": 0,
"Null": 0, "Null": 0,
"all": len(repo_user_contribute_list) "all": len(repo_user_contribute_list)
} }
for u in repo_user_contribute_list: for u in repo_user_contribute_list:
s[u["country"]]+=1 users_stat[u["country"]]+=1
# 统计项目内用户贡献分布
commits_stat = {
"china":0,
"international": 0,
"Null": 0,
}
for u in repo_user_contribute_list:
commits_stat[u["country"]]+=u["total"]
commits_stat["all"] = commits_stat["china"] + commits_stat["Null"] + commits_stat["international"]
# 根据项目贡献者的国别信息统计,来决定项目的国别
# 如果项目内的成员,在国别上占主导优势(超过50%) # 如果项目内的成员,在国别上占主导优势(超过50%)
# 就判定该项目为对应的国别 # 就判定该项目为对应的国别
if s["Null"]/s["all"]>0.5: user_factor = users_stat["china"]/users_stat["all"]
commit_factor =commits_stat["china"]/ commits_stat["all"]
# ----------------------------------------------
#
# 计算项目是否是国产,请按需调参数
#
# ----------------------------------------------
# 是否采用标注国产的信息
use_label_as_china = True
if users_stat["Null"]/users_stat["all"]>0.5:
# 如果标注为国产,再看下中国用户数做进一步分类
if repo_item["region"]=="国产":
if use_label_as_china:
repo_item["country"] = "china"
else:
repo_item["country"] = "Null"
else:
repo_item["country"] = "Null" repo_item["country"] = "Null"
else: else:
if (s["china"]+s["Null"])/s["all"]>0.5: # 为了不闹国际笑话,我们还是判定项目是中国项目的时候,严格一点
# 中国贡献者占比 或者 中国用户commit数超过 一定比例
if user_factor>0.4 or commit_factor>0.4:
repo_item["country"] = "china" repo_item["country"] = "china"
else:
if repo_item["region"]=="国产":
if use_label_as_china:
repo_item["country"] = "china"
else:
repo_item["country"] = "international"
else: else:
repo_item["country"] = "international" repo_item["country"] = "international"
# ----------------------------------------------
repo_user_distribute_dict[repo_key] = {
"基于用户百分比计算国别": repo_item["country"],
"全部开发者": users_stat["all"],
"全部开发者_commits": commits_stat["all"],
"未知开发者": users_stat["Null"],
"未知开发者_commits": commits_stat["Null"],
"中国开发者": users_stat["china"],
"中国开发者_commits": commits_stat["china"],
"国际开发者": users_stat["international"],
"国际开发者_commits": commits_stat["international"],
}
else: else:
repo_user_distribute_dict[repo_key] = {
"基于用户百分比计算国别": "未知",
"全部开发者": "未知",
"全部开发者_commits": "未知",
"未知开发者": "未知",
"未知开发者_commits": "未知",
"中国开发者": "未知",
"中国开发者_commits": "未知",
"国际开发者": "未知",
"国际开发者_commits": "未知",
}
# print(f"[warn] missing contributors repo:{repo_key}") # print(f"[warn] missing contributors repo:{repo_key}")
pass pass
# 输出开源项目——用户组成分析
repo_user_distribute_df = pd.DataFrame.from_dict(repo_user_distribute_dict, orient='index')
repo_user_distribute_df.to_csv(config["ranks"]["repo"]["top_n_user_distribute"])
ctx["ranks"]["top_n_repo_user_distribute"] = repo_user_distribute_df
# 输出开源项目-中国/国际榜
repo_top_n = pd.DataFrame.from_dict(repo_top_n_dict, orient='index')
repo_top_n.reset_index()
repo_top_n_zh_cn = repo_top_n[repo_top_n["country"]=="china"]
repo_top_n_zh_cn.sort_values(by="score", ascending=False, inplace=True)
repo_top_n_zh_cn.to_csv(config["ranks"]["repo"]["top_n_zh_cn"])
ctx["ranks"]["repo_top_n_zh_cn"] = repo_top_n_zh_cn
repo_top_n_en = repo_top_n[repo_top_n["country"]=="international"]
repo_top_n_en.sort_values(by="score", ascending=False, inplace=True)
repo_top_n_en.to_csv(config["ranks"]["repo"]["top_n_en"])
ctx["ranks"]["repo_top_n_en"] = repo_top_n_zh_cn
# 现在,合并每个项目的用户贡献到一个总表 # 现在,合并每个项目的用户贡献到一个总表
personal_score_dict = {} personal_score_dict = {}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册