"""Load the raw open-source statistics tables and (eventually) build rankings.

The module-level ``config`` describes where every input table lives and what
columns it is expected to carry.  Each ``load_*`` function reads one table
with pandas and stores the resulting DataFrame in a shared ``ctx`` dict; the
``rank_*`` functions are placeholders for the ranking steps.
"""

# NOTE: this module is the post-patch state of src/main.py, reconstructed from
# a whitespace-collapsed git patch.  The same patch also added ".DS_Store" and
# "~$*.xlsx" to .gitignore and replaced "CSDN/CSDN 指数.xlsx" with
# "CSDN/repo-csdn-trends.xlsx".

import os  # kept from the original file; not used by the code below

import pandas as pd

# All paths are relative to the current working directory.
config = {
    # Output locations of the generated rankings.
    "ranks": {
        "personal": {
            "top_n": "../rank/个人向主要开源技术栈贡献榜单.csv",
            "top_n_en": "../rank/个人向国际主要开源技术栈贡献榜单.csv",
            "top_n_zh_cn": "../rank/个人向中国主要开源技术栈贡献榜单.csv",
        },
        "company": {
            "top_n": "../rank/公司向主要开源技术栈贡献榜单.csv",
            "top_n_en": "../rank/公司向国际主要开源技术栈贡献榜单.csv",
            "top_n_zh_cn": "../rank/公司向中国主要开源技术栈贡献榜单.csv",
        },
        "repo": {
            "top_n": "../rank/开源项目榜.csv",
            "top_n_en": "../rank/开源项目榜_非中国项目.csv",
            "top_n_zh_cn": "../rank/开源项目榜_中国项目.csv",
        },
    },
    # Input tables: file location, optional Excel sheet, and column schema.
    "schema": {
        "repo_github_user_info": {
            "file": "../GitHub/Userinfo.xlsx",
            "sheet_name": "repo",
            "desc": "开源项目Github贡献者信息",
            "fields": [
                {
                    "field_name": "actor_email",
                    "field_type": "str",
                    "desc": "用户邮箱",
                },
                {
                    "field_name": "sum_total",
                    "field_type": "int",
                    "desc": "用户累计Github项目贡献数",
                },
                {
                    # Key was misspelled "feild_name" in the original.
                    "field_name": "any_repo_path",
                    "field_type": "str",
                    "desc": "用户贡献过的任意一个Github仓库路径",
                },
                {
                    "field_name": "any_commit_id",
                    "field_type": "str",
                    "desc": "用户在上述贡献过的Github仓库里的任意一个commit",
                },
                {
                    "field_name": "avatar_url",
                    "field_type": "str",
                    "desc": "用户头像",
                },
                {
                    "field_name": "name",
                    "field_type": "str",
                    "desc": "用户昵称",
                },
                {
                    # The original had the description text in "field_type"
                    # and an empty "desc"; the two values were swapped.
                    "field_name": "company",
                    "field_type": "str",
                    "desc": "用户在Github上填写的公司名称",
                },
                {
                    "field_name": "location",
                    "field_type": "str",
                    "desc": "用户在Github上填写的城市信息",
                },
                {
                    "field_name": "followers",
                    "field_type": "int",
                    "desc": "用户的被关注数",
                },
                {
                    "field_name": "author_id",
                    "field_type": "int",
                    "desc": "用户Github的ID",
                },
                {
                    "field_name": "type",
                    "field_type": "str",
                    "desc": "账号类型,人类或机器人",
                },
                {
                    "field_name": "login",
                    "field_type": "str",
                    "desc": "登录名",
                },
                {
                    "field_name": "created_at",
                    "field_type": "date",
                    "desc": "创建时间",
                },
                {
                    "field_name": "updated_at",
                    "field_type": "date",
                    "desc": "更新时间",
                },
            ],
        },
        "repo_github_info": {
            "file": "../Github-Repos.xlsx",
            "sheet_name": "汇总",
            "desc": "开源项目在Github上的项目交互数据",
            "fields": [
                {
                    # Key was misspelled "filed_name" in the original.
                    "field_name": "ID",
                    "field_type": "int",
                    "desc": "编号",
                },
                {
                    "field_name": "RepoID",
                    "field_type": "int",
                    "desc": "开源项目GithubID",
                },
                {
                    "field_name": "Org",
                    "field_type": "str",
                    "desc": "开源项目组织",
                },
                {
                    "field_name": "FullName",
                    "field_type": "str",
                    "desc": "开源项目全名",
                },
                {
                    "field_name": "Url",
                    "field_type": "str",
                    "desc": "开源项目url",
                },
                {
                    "field_name": "Repo",
                    "field_type": "str",
                    "desc": "仓库名",
                },
                {
                    "field_name": "Star",
                    "field_type": "str",
                    "desc": "开源项目的Github Star 总数",
                },
                {
                    "field_name": "Fork",
                    "field_type": "str",
                    "desc": "开源项目的Github Fork 总数",
                },
                {
                    "field_name": "Contributors",
                    "field_type": "str",
                    "desc": "开源项目的Github 贡献者 总数",
                },
                {
                    "field_name": "OSC-URL",
                    "field_type": "str",
                    "desc": "开源项目的Github CodeChina URL",
                },
                {
                    "field_name": "Region",
                    "field_type": "str",
                    "desc": "开源项目所属的区域",
                },
            ],
        },
        "repo_csdn_trends": {
            "file": "../CSDN/repo-csdn-trends.xlsx",
            "sheet_name": "Sheet1",
            "desc": "开源项目在CSDN站内指数数据",
            "fields": [
                {
                    "field_name": "repo_name",
                    "field_type": "str",
                    "desc": "开源项目名称",
                },
                {
                    "field_name": "2020-10-01 00:00:00",
                    "field_type": "int",
                    "desc": "月份第一天时间",
                },
            ],
        },
        "repo_github_active_trends": {
            "file": "../PingCAP/项目活跃度变化.csv",
            "desc": "开源项目在Github上的月活跃度数据",
            "fields": [
                {
                    # Key was misspelled "filed_name" in the original.
                    "field_name": "event_month",
                    "field_type": "date",
                    "desc": "交互数据汇总月份",
                },
                {
                    "field_name": "repo_name",
                    "field_type": "str",
                    "desc": "开源项目名称",
                },
                {
                    "field_name": "push_count",
                    "field_type": "int",
                    "desc": "开源项目当月 git push 次数",
                },
                {
                    "field_name": "pr_count",
                    "field_type": "int",
                    "desc": "开源项目当月 git pr 次数",
                },
                {
                    "field_name": "issue_count",
                    "field_type": "int",
                    "desc": "开源项目当月 git issue 个数",
                },
                {
                    "field_name": "creator_count",
                    "field_type": "int",
                    "desc": "开源项目当月 git操作 创建者总数",
                },
            ],
        },
        # NOTE(review): no loader reads this entry yet, and its field list is
        # identical to "repo_github_active_trends" -- looks copy-pasted;
        # confirm against the actual CSV header before relying on it.
        "repo_github_popular_trends": {
            "file": "../PingCAP/项目受欢迎度变化.csv",
            "desc": "开源项目在Github上的月收欢迎程度数据",
            "fields": [
                {
                    # Key was misspelled "filed_name" in the original.
                    "field_name": "event_month",
                    "field_type": "date",
                    "desc": "交互数据汇总月份",
                },
                {
                    "field_name": "repo_name",
                    "field_type": "str",
                    "desc": "开源项目名称",
                },
                {
                    "field_name": "push_count",
                    "field_type": "int",
                    "desc": "开源项目当月 git push 次数",
                },
                {
                    "field_name": "pr_count",
                    "field_type": "int",
                    "desc": "开源项目当月 git pr 次数",
                },
                {
                    "field_name": "issue_count",
                    "field_type": "int",
                    "desc": "开源项目当月 git issue 个数",
                },
                {
                    "field_name": "creator_count",
                    "field_type": "int",
                    "desc": "开源项目当月 git操作 创建者总数",
                },
            ],
        },
    },
}


def _load_excel(config: dict, ctx: dict, name: str) -> None:
    """Read the Excel sheet described by ``config["schema"][name]`` into ctx.

    The file and the sheet name are both taken from the *same* schema entry,
    so a loader can no longer pick up another table's sheet by accident.
    """
    entry = config["schema"][name]
    df = pd.read_excel(entry["file"], sheet_name=entry["sheet_name"])
    print(df.head())
    ctx[name] = df


def _load_csv(config: dict, ctx: dict, name: str) -> None:
    """Read the CSV file described by ``config["schema"][name]`` into ctx."""
    entry = config["schema"][name]
    df = pd.read_csv(entry["file"])
    print(df.columns)
    ctx[name] = df


def load_repo_github_user_info(config: dict, ctx: dict) -> None:
    """Load the GitHub contributor table into ``ctx["repo_github_user_info"]``.

    Raises:
        KeyError: if the schema entry or one of its keys is missing.
    """
    _load_excel(config, ctx, "repo_github_user_info")


def load_repo_github_info(config: dict, ctx: dict) -> None:
    """Load the GitHub repository table into ``ctx["repo_github_info"]``.

    Raises:
        KeyError: if the schema entry or one of its keys is missing.
    """
    _load_excel(config, ctx, "repo_github_info")


def load_repo_csdn_trends(config: dict, ctx: dict) -> None:
    """Load the CSDN index table into ``ctx["repo_csdn_trends"]``.

    The original read the sheet name from the ``repo_github_info`` entry;
    it now uses the sheet configured for ``repo_csdn_trends`` itself.

    Raises:
        KeyError: if the schema entry or one of its keys is missing.
    """
    _load_excel(config, ctx, "repo_csdn_trends")


def load_repo_commit_rank(config: dict, ctx: dict) -> None:
    """Load the commit-rank CSV into ``ctx["repo_commit_rank"]`` if configured.

    ``config["schema"]`` has no ``"repo_commit_rank"`` entry, so the original
    always raised ``KeyError`` here and aborted ``main``.  When the entry is
    absent the load is skipped with a message and ``ctx`` is left untouched.
    """
    if "repo_commit_rank" not in config["schema"]:
        print('skip repo_commit_rank: no "repo_commit_rank" entry in schema')
        return
    _load_csv(config, ctx, "repo_commit_rank")


def load_repo_github_active_trends(config: dict, ctx: dict) -> None:
    """Load the monthly activity CSV into ``ctx["repo_github_active_trends"]``.

    Raises:
        KeyError: if the schema entry or its ``"file"`` key is missing.
    """
    _load_csv(config, ctx, "repo_github_active_trends")


def rank_personal_top_n(config: dict, ctx: dict) -> None:
    """Placeholder for the personal ranking step; currently does nothing."""


def rank_company_top_n(config: dict, ctx: dict) -> None:
    """Placeholder for the company ranking step; currently does nothing."""


def rank_repo_top_n(config: dict, ctx: dict) -> None:
    """Placeholder for the repository ranking step; currently does nothing."""


def main(config: dict) -> None:
    """Run every loader, then every ranking step, sharing one ``ctx`` dict."""
    ctx = {}

    load_repo_github_user_info(config, ctx)
    load_repo_github_info(config, ctx)
    load_repo_csdn_trends(config, ctx)
    load_repo_commit_rank(config, ctx)
    load_repo_github_active_trends(config, ctx)

    rank_personal_top_n(config, ctx)
    rank_company_top_n(config, ctx)
    rank_repo_top_n(config, ctx)


if __name__ == "__main__":
    main(config)