提交 21b2ed0e 编写于 作者: F feilong

fix repo country

上级 067be799
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
......@@ -21,8 +21,9 @@ config = {
},
"repo": {
"top_n": "../ranks/开源项目榜.csv",
"top_n_en": "../ranks/开源项目榜_非中国项目.csv",
"top_n_zh_cn": "../ranks/开源项目榜_中国项目.csv"
"top_n_en": "../ranks/开源项目榜_国际项目.csv",
"top_n_zh_cn": "../ranks/开源项目榜_中国项目.csv",
"top_n_user_distribute": "../ranks/开源项目_用户组成.csv",
}
},
"schema": {
......
......@@ -264,36 +264,172 @@ def rank_personal_top_n(config, ctx):
print(f"exit_person_count:{exit_person_count}")
# 项目内的用户贡献排序
repo_china_user_avg_factor = 0
repo_china_user_avg_factor_c = 0
repo_china_user_commit_avg_factor = 0
repo_china_user_commit_avg_factor_c = 0
for repo_key in repo_top_n_dict:
repo_item = repo_top_n_dict[repo_key]
repo_user_contribute_list = repo_item.get('user_contribute_list')
if repo_item['region']!="国产":
continue
if repo_user_contribute_list is not None:
repo_user_contribute_list.sort(
key=lambda x: x["total"], reverse=True)
# 根据项目贡献者的国别信息统计,来决定项目的国别
s = {
# 统计项目内用户分布
users_stat = {
"china":0,
"international": 0,
"Null": 0,
"all": len(repo_user_contribute_list)
}
for u in repo_user_contribute_list:
users_stat[u["country"]]+=1
# 统计项目内用户贡献分布
commits_stat = {
"china":0,
"international": 0,
"Null": 0,
}
for u in repo_user_contribute_list:
commits_stat[u["country"]]+=u["total"]
commits_stat["all"] = commits_stat["china"] + commits_stat["Null"] + commits_stat["international"]
repo_china_user_avg_factor += users_stat["china"]/users_stat["all"]
repo_china_user_avg_factor_c += 1
repo_china_user_commit_avg_factor += commits_stat["china"]/commits_stat["all"]
repo_china_user_commit_avg_factor_c +=1
repo_china_user_avg_factor = repo_china_user_avg_factor/repo_china_user_avg_factor_c
repo_china_user_commit_avg_factor = repo_china_user_commit_avg_factor/repo_china_user_commit_avg_factor_c
repo_user_distribute_dict = {}
for repo_key in repo_top_n_dict:
repo_item = repo_top_n_dict[repo_key]
repo_user_contribute_list = repo_item.get('user_contribute_list')
if repo_user_contribute_list is not None:
repo_user_contribute_list.sort(
key=lambda x: x["total"], reverse=True)
# 统计项目内用户分布
users_stat = {
"china":0,
"international": 0,
"Null": 0,
"all": len(repo_user_contribute_list)
}
for u in repo_user_contribute_list:
s[u["country"]]+=1
users_stat[u["country"]]+=1
# 统计项目内用户贡献分布
commits_stat = {
"china":0,
"international": 0,
"Null": 0,
}
for u in repo_user_contribute_list:
commits_stat[u["country"]]+=u["total"]
commits_stat["all"] = commits_stat["china"] + commits_stat["Null"] + commits_stat["international"]
# 根据项目贡献者的国别信息统计,来决定项目的国别
# 如果项目内的成员,在国别上占主导优势(超过50%)
# 就判定该项目为对应的国别
if s["Null"]/s["all"]>0.5:
repo_item["country"] = "Null"
user_factor = users_stat["china"]/users_stat["all"]
commit_factor =commits_stat["china"]/ commits_stat["all"]
# ----------------------------------------------
#
# 计算项目是否是国产,请按需调参数
#
# ----------------------------------------------
# 是否采用标注国产的信息
use_label_as_china = True
if users_stat["Null"]/users_stat["all"]>0.5:
# 如果标注为国产,再看下中国用户数做进一步分类
if repo_item["region"]=="国产":
if use_label_as_china:
repo_item["country"] = "china"
else:
repo_item["country"] = "Null"
else:
repo_item["country"] = "Null"
else:
if (s["china"]+s["Null"])/s["all"]>0.5:
# 为了不闹国际笑话,我们还是判定项目是中国项目的时候,严格一点
# 中国贡献者占比 或者 中国用户commit数超过 一定比例
if user_factor>0.4 or commit_factor>0.4:
repo_item["country"] = "china"
else:
repo_item["country"] = "international"
if repo_item["region"]=="国产":
if use_label_as_china:
repo_item["country"] = "china"
else:
repo_item["country"] = "international"
else:
repo_item["country"] = "international"
# ----------------------------------------------
repo_user_distribute_dict[repo_key] = {
"基于用户百分比计算国别": repo_item["country"],
"全部开发者": users_stat["all"],
"全部开发者_commits": commits_stat["all"],
"未知开发者": users_stat["Null"],
"未知开发者_commits": commits_stat["Null"],
"中国开发者": users_stat["china"],
"中国开发者_commits": commits_stat["china"],
"国际开发者": users_stat["international"],
"国际开发者_commits": commits_stat["international"],
}
else:
repo_user_distribute_dict[repo_key] = {
"基于用户百分比计算国别": "未知",
"全部开发者": "未知",
"全部开发者_commits": "未知",
"未知开发者": "未知",
"未知开发者_commits": "未知",
"中国开发者": "未知",
"中国开发者_commits": "未知",
"国际开发者": "未知",
"国际开发者_commits": "未知",
}
# print(f"[warn] missing contributors repo:{repo_key}")
pass
# Export the per-repository contributor composition table
# (one row per repo key, columns as built in repo_user_distribute_dict).
repo_user_distribute_df = pd.DataFrame.from_dict(
    repo_user_distribute_dict, orient='index')
repo_user_distribute_df.to_csv(config["ranks"]["repo"]["top_n_user_distribute"])
ctx["ranks"]["top_n_repo_user_distribute"] = repo_user_distribute_df

# Export the China / international project rankings.
# The repo key is kept as the DataFrame index: the previous bare
# `repo_top_n.reset_index()` call discarded its result and had no effect,
# so it is dropped here without changing the CSV layout.
repo_top_n = pd.DataFrame.from_dict(repo_top_n_dict, orient='index')

# Boolean filtering returns a new frame; sort by reassignment instead of
# `inplace=True` on the filtered result to avoid SettingWithCopyWarning.
repo_top_n_zh_cn = repo_top_n[repo_top_n["country"] == "china"].sort_values(
    by="score", ascending=False)
repo_top_n_zh_cn.to_csv(config["ranks"]["repo"]["top_n_zh_cn"])
ctx["ranks"]["repo_top_n_zh_cn"] = repo_top_n_zh_cn

repo_top_n_en = repo_top_n[repo_top_n["country"] == "international"].sort_values(
    by="score", ascending=False)
repo_top_n_en.to_csv(config["ranks"]["repo"]["top_n_en"])
# Fix: the international ranking was previously stored as repo_top_n_zh_cn
# (copy-paste error), so consumers of ctx saw the China list twice.
ctx["ranks"]["repo_top_n_en"] = repo_top_n_en
# 现在,合并每个项目的用户贡献到一个总表
personal_score_dict = {}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册