# -*- coding: utf-8 -*-
"""Build open-source project rankings from GitHub and CSDN statistics.

The script loads the spreadsheets/CSV files listed in ``config["schema"]``
into a shared ``ctx`` dictionary and writes a scored repository ranking to
``config["ranks"]["repo"]["top_n"]``.
"""
import os

import numpy as np
import pandas as pd

# Input/output locations plus a descriptive schema of every input table.
# The "fields" lists are documentation only: no function in this file reads
# them.
config = {
    "ranks": {
        "personal": {
            "top_n": "../ranks/个人向主要开源技术栈贡献榜单.csv",
            "top_n_en": "../ranks/个人向国际主要开源技术栈贡献榜单.csv",
            "top_n_zh_cn": "../ranks/个人向中国主要开源技术栈贡献榜单.csv",
        },
        "company": {
            "top_n": "../ranks/公司向主要开源技术栈贡献榜单.csv",
            "top_n_en": "../ranks/公司向国际主要开源技术栈贡献榜单.csv",
            "top_n_zh_cn": "../ranks/公司向中国主要开源技术栈贡献榜单.csv",
        },
        "repo": {
            "top_n": "../ranks/开源项目榜.csv",
            "top_n_en": "../ranks/开源项目榜_非中国项目.csv",
            "top_n_zh_cn": "../ranks/开源项目榜_中国项目.csv"
        }
    },
    "schema": {
        "repo_github_user_info": {
            "file": "../data/GitHub/Userinfo.xlsx",
            "sheet_name": "repo",
            "desc": "开源项目Github贡献者信息",
            "fields": [
                {"field_name": "actor_email", "field_type": "str", "desc": "用户邮箱"},
                {"field_name": "sum_total", "field_type": "int", "desc": "用户累计Github项目贡献数"},
                {"field_name": "any_repo_path", "field_type": "str", "desc": "用户贡献过的任意一个Github仓库路径"},
                {"field_name": "any_commit_id", "field_type": "str", "desc": "用户在上述贡献过的Github仓库里的任意一个commit"},
                {"field_name": "avatar_url", "field_type": "str", "desc": "用户头像"},
                {"field_name": "name", "field_type": "str", "desc": "用户昵称"},
                {"field_name": "company", "field_type": "str", "desc": "用户在Github上填写的公司名称"},
                {"field_name": "location", "field_type": "str", "desc": "用户在Github上填写的城市信息"},
                {"field_name": "followers", "field_type": "int", "desc": "用户的被关注数"},
                {"field_name": "author_id", "field_type": "int", "desc": "用户Github的ID"},
                {"field_name": "type", "field_type": "str", "desc": "账号类型,人类或机器人"},
                {"field_name": "login", "field_type": "str", "desc": "登录名"},
                {"field_name": "created_at", "field_type": "date", "desc": "创建时间"},
                {"field_name": "updated_at", "field_type": "date", "desc": "更新时间"}
            ]
        },
        "repo_github_info": {
            "file": "../data/Github-Repos.xlsx",
            "sheet_name": "汇总",
            "desc": "开源项目在Github上的项目交互数据",
            "fields": [
                {"field_name": "ID", "field_type": "int", "desc": "编号"},
                {"field_name": "RepoID", "field_type": "int", "desc": "开源项目GithubID"},
                {"field_name": "Org", "field_type": "str", "desc": "开源项目组织"},
                {"field_name": "FullName", "field_type": "str", "desc": "开源项目全名"},
                {"field_name": "Url", "field_type": "str", "desc": "开源项目url"},
                {"field_name": "Repo", "field_type": "str", "desc": "仓库名"},
                {"field_name": "Star", "desc": "开源项目的Github Star 总数", "field_type": "str"},
                {"field_name": "Fork", "desc": "开源项目的Github Fork 总数", "field_type": "str"},
                {"field_name": "Contributors", "desc": "开源项目的Github 贡献者 总数", "field_type": "str"},
                {"field_name": "OSC-URL", "desc": "开源项目的Github CodeChina URL", "field_type": "str"},
                {"field_name": "Region", "desc": "开源项目所属的区域", "field_type": "str"}
            ]
        },
        "repo_csdn_trends": {
            "file": "../data/CSDN/repo-csdn-trends.xlsx",
            "sheet_name": "Sheet1",
            "desc": "开源项目在CSDN站内指数数据",
            "fields": [
                {"field_name": "repo_name", "field_type": "str", "desc": "开源项目名称"},
                {"field_name": "2020-10-01 00:00:00", "field_type": "int", "desc": "月份第一天时间"}
            ]
        },
        # NOTE(review): these fields are identical to the first four of
        # repo_github_user_info and do not match the table description --
        # confirm against the actual CSV header.
        "repo_commit_rank": {
            "file": "../data/CSDN/repo-commit-rank.csv",
            "desc": "开源项目在Github的月commit变化",
            "fields": [
                {"field_name": "actor_email", "field_type": "str", "desc": "用户邮箱"},
                {"field_name": "sum_total", "field_type": "int", "desc": "用户累计Github项目贡献数"},
                {"field_name": "any_repo_path", "field_type": "str", "desc": "用户贡献过的任意一个Github仓库路径"},
                {"field_name": "any_commit_id", "field_type": "str", "desc": "用户在上述贡献过的Github仓库里的任意一个commit"}
            ]
        },
        "repo_github_active_trends": {
            "file": "../data/PingCAP/项目活跃度变化.csv",
            "desc": "开源项目在Github上的月活跃度数据",
            "fields": [
                {"field_name": "event_month", "field_type": "date", "desc": "交互数据汇总月份"},
                {"field_name": "repo_name", "field_type": "str", "desc": "开源项目名称"},
                {"field_name": "push_count", "field_type": "int", "desc": "开源项目当月 git push 次数"},
                {"field_name": "pr_count", "field_type": "int", "desc": "开源项目当月 git pr 次数"},
                {"field_name": "issue_count", "field_type": "int", "desc": "开源项目当月 git issue 个数"},
                {"field_name": "creator_count", "field_type": "int", "desc": "开源项目当月 git操作 创建者总数"}
            ]
        },
        # NOTE(review): the fields below are identical to
        # repo_github_active_trends, but rank_repo_top_n reads "watch_count"
        # and "fork_count" from this table -- confirm the real CSV columns.
        "repo_github_popular_trends": {
            "file": "../data/PingCAP/项目受欢迎度变化.csv",
            "desc": "开源项目在Github上的月收欢迎程度数据",
            "fields": [
                {"field_name": "event_month", "field_type": "date", "desc": "交互数据汇总月份"},
                {"field_name": "repo_name", "field_type": "str", "desc": "开源项目名称"},
                {"field_name": "push_count", "field_type": "int", "desc": "开源项目当月 git push 次数"},
                {"field_name": "pr_count", "field_type": "int", "desc": "开源项目当月 git pr 次数"},
                {"field_name": "issue_count", "field_type": "int", "desc": "开源项目当月 git issue 个数"},
                {"field_name": "creator_count", "field_type": "int", "desc": "开源项目当月 git操作 创建者总数"}
            ]
        }
    },
}

# Relative weight of every metric in the repository score. The weights are
# normalised by their sum before use, so only their ratios matter.
REPO_SCORE_WEIGHTS = {
    "star": 1,
    "fork": 1,
    "contributors": 1,
    "csdn_index_month_avg": 1,
    "push_count_month_avg": 1,
    "pr_count_month_avg": 1,
    "issue_count_month_avg": 1,
    "creator_count_month_avg": 1,
    "watch_count_month_avg": 1,
    "fork_count_month_avg": 1
}

# Source column -> output column for the two monthly GitHub trend tables.
_ACTIVE_TREND_COLUMNS = {
    "push_count": "push_count_month_avg",
    "pr_count": "pr_count_month_avg",
    "issue_count": "issue_count_month_avg",
    "creator_count": "creator_count_month_avg",
}
_POPULAR_TREND_COLUMNS = {
    "watch_count": "watch_count_month_avg",
    "fork_count": "fork_count_month_avg",
}


def _load_table(config, ctx, name, excel=False):
    """Read the table ``name`` described in ``config["schema"]`` into ``ctx``.

    Args:
        config: Configuration dictionary with a ``"schema"`` section.
        ctx: Dictionary that receives the DataFrame under the key ``name``.
        name: Key of the table inside ``config["schema"]``.
        excel: When true the file is read with ``pd.read_excel`` using the
            configured ``sheet_name``; otherwise with ``pd.read_csv``.
    """
    table = config["schema"][name]
    if excel:
        df = pd.read_excel(table["file"], sheet_name=table["sheet_name"])
    else:
        df = pd.read_csv(table["file"])
    # Missing cells become 0 so later numeric conversions do not see NaN.
    df.fillna(value=0, inplace=True)
    ctx[name] = df


def load_repo_github_user_info(config, ctx):
    """Load the GitHub contributor sheet into ``ctx["repo_github_user_info"]``."""
    _load_table(config, ctx, "repo_github_user_info", excel=True)


def load_repo_github_info(config, ctx):
    """Load the GitHub repository sheet into ``ctx["repo_github_info"]``."""
    _load_table(config, ctx, "repo_github_info", excel=True)


def load_repo_csdn_trends(config, ctx):
    """Load the CSDN index sheet into ``ctx["repo_csdn_trends"]``."""
    _load_table(config, ctx, "repo_csdn_trends", excel=True)


def load_repo_commit_rank(config, ctx):
    """Load the commit-rank CSV into ``ctx["repo_commit_rank"]``."""
    _load_table(config, ctx, "repo_commit_rank")


def load_repo_github_active_trends(config, ctx):
    """Load the activity CSV into ``ctx["repo_github_active_trends"]``."""
    _load_table(config, ctx, "repo_github_active_trends")


def load_repo_github_popular_trends(config, ctx):
    """Load the popularity CSV into ``ctx["repo_github_popular_trends"]``."""
    _load_table(config, ctx, "repo_github_popular_trends")


def _repo_key(repo_name):
    """Return the lower-cased join key for ``repo_name``, or None.

    The loaders replace missing cells with 0, so a non-string name means the
    row has no usable repository name.
    """
    if not isinstance(repo_name, str):
        return None
    return repo_name.lower()


def _attach_metrics(repo_dict, repo_name, metrics, skipped):
    """Merge ``metrics`` into the entry for ``repo_name`` if it is known.

    Names that are absent from ``repo_dict`` are appended to ``skipped``
    instead of raising.
    """
    repo_item = repo_dict.get(_repo_key(repo_name))
    if repo_item is None:
        skipped.append(repo_name)
        return
    repo_item.update(metrics)


def _report_skipped(source, skipped):
    """Print how many rows of ``source`` had no match in repo_github_info."""
    if skipped:
        print("@rank_repo_top_n: skipped %d row(s) of %s without a match in "
              "repo_github_info" % (len(skipped), source))


def _merge_monthly_means(repo_dict, df, source, column_map):
    """Average ``column_map`` source columns per repo and merge the result.

    Only the listed metric columns are averaged: averaging every column
    would also hit non-numeric ones such as ``event_month``.
    """
    skipped = []
    means = df.groupby("repo_name")[list(column_map)].mean()
    for repo_name, row in means.iterrows():
        metrics = {target: row[column] for column, target in column_map.items()}
        _attach_metrics(repo_dict, repo_name, metrics, skipped)
    _report_skipped(source, skipped)


def _build_repo_table(ctx):
    """Return one row of metrics per repository, indexed by lower-cased name."""
    repo_dict = {}

    # Base data: repo_github_info, keyed by FullName.
    for _, row in ctx["repo_github_info"].iterrows():
        repo_key = _repo_key(row["FullName"])
        if repo_key is None:
            continue
        repo_dict[repo_key] = {
            "region": row["Region"],
            "star": int(row["Star"]),
            "fork": int(row["Fork"]),
            "contributors": int(row["Contributors"]),
        }

    # CSDN index: repo_csdn_trends, keyed by repo_name. Every other column
    # is averaged into a single value per row.
    skipped = []
    for _, row in ctx["repo_csdn_trends"].iterrows():
        monthly_values = row.drop(labels="repo_name")
        _attach_metrics(
            repo_dict, row["repo_name"],
            {"csdn_index_month_avg": monthly_values.mean()}, skipped)
    _report_skipped("repo_csdn_trends", skipped)

    # Monthly GitHub activity and popularity, both keyed by repo_name.
    _merge_monthly_means(
        repo_dict, ctx["repo_github_active_trends"],
        "repo_github_active_trends", _ACTIVE_TREND_COLUMNS)
    _merge_monthly_means(
        repo_dict, ctx["repo_github_popular_trends"],
        "repo_github_popular_trends", _POPULAR_TREND_COLUMNS)

    df = pd.DataFrame.from_dict(repo_dict, orient='index')
    # A metric that no repository received has no column yet; create it so
    # the score computation below never hits a missing column.
    for column in REPO_SCORE_WEIGHTS:
        if column not in df.columns:
            df[column] = 0
    df.fillna(value=0, inplace=True)
    return df


def rank_repo_top_n(config, ctx):
    """Score every repository and write the ranking CSV.

    The score is the weighted sum of the metrics in ``REPO_SCORE_WEIGHTS``
    with the weights normalised to sum to 1. Rows are written in descending
    score order to ``config["ranks"]["repo"]["top_n"]``; the CSV index is the
    lower-cased repository full name.

    Args:
        config: Configuration dictionary; only the output path is read.
        ctx: Must contain the DataFrames ``repo_github_info``,
            ``repo_csdn_trends``, ``repo_github_active_trends`` and
            ``repo_github_popular_trends``.

    Raises:
        KeyError: If a required table or column is missing.
    """
    df = _build_repo_table(ctx)

    # Compute the ranking score.
    total_weight_value = sum(REPO_SCORE_WEIGHTS.values())
    df["score"] = 0
    for key, weight in REPO_SCORE_WEIGHTS.items():
        df["score"] += df[key] * (weight / total_weight_value)

    print(df.head())
    # A stable sort keeps tied repositories in a deterministic order.
    df = df.sort_values(by="score", ascending=False, kind="mergesort")

    out_path = config["ranks"]["repo"]["top_n"]
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_csv(out_path)


def rank_personal_top_n(config, ctx):
    """Placeholder for the personal contribution ranking (not implemented)."""
    pass


def rank_company_top_n(config, ctx):
    """Placeholder for the company contribution ranking (not implemented)."""
    pass


def main(config):
    """Load the tables needed for the repository ranking and produce it.

    NOTE: the user-info and commit-rank loaders and the personal/company
    rankings are intentionally not called here.
    """
    ctx = {}

    print("@load_repo_github_info..")
    load_repo_github_info(config, ctx)

    print("@load_repo_csdn_trends..")
    load_repo_csdn_trends(config, ctx)

    print("@load_repo_github_active_trends..")
    load_repo_github_active_trends(config, ctx)

    print("@load_repo_github_popular_trends..")
    load_repo_github_popular_trends(config, ctx)

    print("@rank_repo_top_n..")
    rank_repo_top_n(config, ctx)


if __name__ == "__main__":
    main(config)