diff --git a/.gitignore b/.gitignore index e4d055739eb797412507e83758056b2ad87ab5f0..1ba5c70414031c6f7fe6a4705299523e6079d306 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ .DS_Store *.txt -~$*.xlsx \ No newline at end of file +~$*.xlsx +__pycache__ +!requirement.txt \ No newline at end of file diff --git a/README.md b/README.md index f64950ad304b6e4fb93efa901d24690fea72582c..e7b3bfee71acd7a6c59a0d70bd38b044ec6ad3c4 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,13 @@ * `ranks/` 目录下是榜单数据输出目录 * `src/` 是数据获取和榜单计算源代码目录,其中榜单计算代码是:`src/main.py` +## 项目配置和运行 + +1. python 3 环境 +2. cd src/ +3. `pip install -r requirements.txt` +4. `python main.py -a tasks.ranks` + ## 数据源 1. **Apache 基金会项目** ,GitHub Apache 组织下的所有项目,通过 GitHub API 获取,每个 Json 文件含 100 个项目; diff --git a/src/common/cli.py b/src/common/cli.py new file mode 100644 index 0000000000000000000000000000000000000000..8e3a30d2420dade46306ed49310e982be73cb56e --- /dev/null +++ b/src/common/cli.py @@ -0,0 +1,52 @@ +# -*- coding: UTF-8 -*- + +''' +命令行分发路由 +''' + + +def dispatch(actions, targets): + """ 分发命令行 action """ + + action_len = len(actions) + print(action_len) + + if action_len < 2: + if targets.get('run') != None: + print(f"[命令路由执行]:", '->'.join(actions)) + targets['run']() + else: + print('action not found') + return + + index = 0 + next = targets + action = actions[index] + print(f"[命令路由中..]: {actions[0]}") + + print(actions) + while action_len >= index: + if type(next) == type({}): + if index == action_len: + if next.get('run') is not None: + print(f"[命令路由执行]:", '->'.join(actions)) + next['run']() + break + else: + print('not found') + + action = actions[index] + if next.get(action) is not None: + print(f"[命令路由中..]: {action}") + next = next[action] + + index += 1 + else: + print("[命令路由错误]: 未找到支持的命令行路由:", + '->'.join(actions), ", obj:", next) + index += 1 + else: + print(f"[命令路由执行]:", '->'.join(actions)) + next() + index += 1 + break diff --git a/src/common/json.py b/src/common/json.py new file mode 
100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/common/util.py b/src/common/util.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/config.py b/src/config.py new file mode 100644 index 0000000000000000000000000000000000000000..323b40d89baa1e799245f921e02a4c97e2e82d0c --- /dev/null +++ b/src/config.py @@ -0,0 +1,275 @@ +config = { + "ranks": { + "personal": { + "top_n": "../ranks/个人向主要开源技术栈贡献榜单.csv", + "top_n_en": "../ranks/个人向国际主要开源技术栈贡献榜单.csv", + "top_n_zh_cn": "../ranks/个人向中国主要开源技术栈贡献榜单.csv", + }, + "company": { + "top_n": "../ranks/公司向主要开源技术栈贡献榜单.csv", + "top_n_en": "../ranks/公司向国际主要开源技术栈贡献榜单.csv", + "top_n_zh_cn": "../ranks/公司向中国主要开源技术栈贡献榜单.csv", + }, + "repo": { + "top_n": "../ranks/开源项目榜.csv", + "top_n_en": "../ranks/开源项目榜_非中国项目.csv", + "top_n_zh_cn": "../ranks/开源项目榜_中国项目.csv" + } + }, + "schema": { + "repo_github_user_info": { + "file": "../data/GitHub/Userinfo.xlsx", + "sheet_name": "repo", + "desc": "开源项目Github贡献者信息", + "fields": [ + { + "field_name": "actor_email", + "field_type": "str", + "desc": "用户邮箱" + }, + { + "field_name": "sum_total", + "field_type": "int", + "desc": "用户累计Github项目贡献数" + }, + { + "feild_name": "any_repo_path", + "field_type": "str", + "desc": "用户贡献过的任意一个Github仓库路径" + }, + { + "field_name": "any_commit_id", + "field_type": "str", + "desc": "用户在上述贡献过的Github仓库里的任意一个commit" + }, + { + "field_name": "avatar_url", + "field_type": "str", + "desc": "用户头像" + }, + { + "field_name": "name", + "field_type": "str", + "desc": "用户昵称" + }, + { + "field_name": "company", + "field_type": "用户在Github上填写的公司名称", + "desc": "" + }, + { + "field_name": "location", + "field_type": "str", + "desc": "用户在Github上填写的城市信息" + }, + { + "field_name": "followers", + "field_type": "int", + "desc": "用户的被关注数" + }, + { + "field_name": "author_id", + "field_type": "int", + "desc": "用户Github的ID" + }, + { + "field_name": "type", + 
"field_type": "str", + "desc": "账号类型,人类或机器人" + }, + { + "field_name": "login", + "field_type": "str", + "desc": "登录名" + }, + { + "field_name": "created_at", + "field_type": "date", + "desc": "创建时间" + }, + { + "field_name": "updated_at", + "field_type": "date", + "desc": "更新时间" + } + ] + }, + "repo_github_info": { + "file": "../data/Github-Repos.xlsx", + "sheet_name": "汇总", + "desc": "开源项目在Github上的项目交互数据", + "fields": [ + { + "filed_name": "ID", + "field_type": "int", + "desc": "编号" + }, + { + "field_name": "RepoID", + "field_type": "int", + "desc": "开源项目GithubID" + }, + { + "field_name": "Org", + "field_type": "str", + "desc": "开源项目组织" + }, + { + "field_name": "FullName", + "field_type": "str", + "desc": "开源项目全名" + }, + { + "field_name": "Url", + "field_type": "str", + "desc": "开源项目url" + }, + { + "field_name": "Repo", + "field_type": "str", + "desc": "仓库名" + }, + { + "field_name": "Star", + "desc": "开源项目的Github Star 总数", + "field_type": "str" + }, + { + "field_name": "Fork", + "desc": "开源项目的Github Fork 总数", + "field_type": "str" + }, + { + "field_name": "Contributors", + "desc": "开源项目的Github 贡献者 总数", + "field_type": "str" + }, + { + "field_name": "OSC-URL", + "desc": "开源项目的Github CodeChina URL", + "field_type": "str" + }, + { + "field_name": "Region", + "desc": "开源项目所属的区域", + "field_type": "str" + } + ] + }, + "repo_csdn_trends": { + "file": "../data/CSDN/repo-csdn-trends.xlsx", + "sheet_name": "Sheet1", + "desc": "开源项目在CSDN站内指数数据", + "fields": [ + { + "field_name": "repo_name", + "field_type": "str", + "desc": "开源项目名称" + }, + { + "field_name": "2020-10-01 00:00:00", + "field_type": "int", + "desc": "月份第一天时间" + } + ] + }, + "repo_commit_rank": { + "file": "../data/CSDN/repo-commit-rank.csv", + "desc": "开源项目在Github的月commit变化", + "fields": [ + { + "field_name": "actor_email", + "field_type": "str", + "desc": "用户邮箱" + }, + { + "field_name": "sum_total", + "field_type": "int", + "desc": "用户累计Github项目贡献数" + }, + { + "feild_name": "any_repo_path", + "field_type": "str", 
+ "desc": "用户贡献过的任意一个Github仓库路径" + }, + { + "field_name": "any_commit_id", + "field_type": "str", + "desc": "用户在上述贡献过的Github仓库里的任意一个commit" + } + ] + }, + "repo_github_active_trends": { + "file": "../data/PingCAP/项目活跃度变化.csv", + "desc": "开源项目在Github上的月活跃度数据", + "fields": [ + { + "filed_name": "event_month", + "field_type": "date", + "desc": "交互数据汇总月份" + }, + { + "field_name": "repo_name", + "field_type": "str", + "desc": "开源项目名称" + }, + { + "field_name": "push_count", + "field_type": "int", + "desc": "开源项目当月 git push 次数" + }, + { + "field_name": "pr_count", + "field_type": "int", + "desc": "开源项目当月 git pr 次数" + }, + { + "field_name": "issue_count", + "field_type": "int", + "desc": "开源项目当月 git issue 个数" + }, + { + "field_name": "creator_count", + "field_type": "int", + "desc": "开源项目当月 git操作 创建者总数" + } + ] + }, + "repo_github_popular_trends": { + "file": "../data/PingCAP/项目受欢迎度变化.csv", + "desc": "开源项目在Github上的月收欢迎程度数据", + "fields": [ + { + "filed_name": "event_month", + "field_type": "date", + "desc": "交互数据汇总月份" + }, + { + "field_name": "repo_name", + "field_type": "str", + "desc": "开源项目名称" + }, + { + "field_name": "push_count", + "field_type": "int", + "desc": "开源项目当月 git push 次数" + }, + { + "field_name": "pr_count", + "field_type": "int", + "desc": "开源项目当月 git pr 次数" + }, + { + "field_name": "issue_count", + "field_type": "int", + "desc": "开源项目当月 git issue 个数" + }, + { + "field_name": "creator_count", + "field_type": "int", + "desc": "开源项目当月 git操作 创建者总数" + } + ] + } + }, +} diff --git a/src/main.py b/src/main.py index c09911921d2fc7a7c51eadd1b1de885d99c87286..1959dd3493a27a164faa7f8730a4ef76f4078b0b 100644 --- a/src/main.py +++ b/src/main.py @@ -1,455 +1,44 @@ # -*- coding: utf-8 -*- -import os -import numpy as np -import pandas as pd - -config = { - "ranks": { - "personal": { - "top_n": "../ranks/个人向主要开源技术栈贡献榜单.csv", - "top_n_en": "../ranks/个人向国际主要开源技术栈贡献榜单.csv", - "top_n_zh_cn": "../ranks/个人向中国主要开源技术栈贡献榜单.csv", - }, - "company": { - "top_n": 
"../ranks/公司向主要开源技术栈贡献榜单.csv", - "top_n_en": "../ranks/公司向国际主要开源技术栈贡献榜单.csv", - "top_n_zh_cn": "../ranks/公司向中国主要开源技术栈贡献榜单.csv", - }, - "repo": { - "top_n": "../ranks/开源项目榜.csv", - "top_n_en": "../ranks/开源项目榜_非中国项目.csv", - "top_n_zh_cn": "../ranks/开源项目榜_中国项目.csv" - } - }, - "schema": { - "repo_github_user_info": { - "file": "../data/GitHub/Userinfo.xlsx", - "sheet_name": "repo", - "desc": "开源项目Github贡献者信息", - "fields": [ - { - "field_name": "actor_email", - "field_type": "str", - "desc": "用户邮箱" - }, - { - "field_name": "sum_total", - "field_type": "int", - "desc": "用户累计Github项目贡献数" - }, - { - "feild_name": "any_repo_path", - "field_type": "str", - "desc": "用户贡献过的任意一个Github仓库路径" - }, - { - "field_name": "any_commit_id", - "field_type": "str", - "desc": "用户在上述贡献过的Github仓库里的任意一个commit" - }, - { - "field_name": "avatar_url", - "field_type": "str", - "desc": "用户头像" - }, - { - "field_name": "name", - "field_type": "str", - "desc": "用户昵称" - }, - { - "field_name": "company", - "field_type": "用户在Github上填写的公司名称", - "desc": "" - }, - { - "field_name": "location", - "field_type": "str", - "desc": "用户在Github上填写的城市信息" - }, - { - "field_name": "followers", - "field_type": "int", - "desc": "用户的被关注数" - }, - { - "field_name": "author_id", - "field_type": "int", - "desc": "用户Github的ID" - }, - { - "field_name": "type", - "field_type": "str", - "desc": "账号类型,人类或机器人" - }, - { - "field_name": "login", - "field_type": "str", - "desc": "登录名" - }, - { - "field_name": "created_at", - "field_type": "date", - "desc": "创建时间" - }, - { - "field_name": "updated_at", - "field_type": "date", - "desc": "更新时间" - } - ] - }, - "repo_github_info": { - "file": "../data/Github-Repos.xlsx", - "sheet_name": "汇总", - "desc": "开源项目在Github上的项目交互数据", - "fields": [ - { - "filed_name": "ID", - "field_type": "int", - "desc": "编号" - }, - { - "field_name": "RepoID", - "field_type": "int", - "desc": "开源项目GithubID" - }, - { - "field_name": "Org", - "field_type": "str", - "desc": "开源项目组织" - }, - { - "field_name": 
"FullName", - "field_type": "str", - "desc": "开源项目全名" - }, - { - "field_name": "Url", - "field_type": "str", - "desc": "开源项目url" - }, - { - "field_name": "Repo", - "field_type": "str", - "desc": "仓库名" - }, - { - "field_name": "Star", - "desc": "开源项目的Github Star 总数", - "field_type": "str" - }, - { - "field_name": "Fork", - "desc": "开源项目的Github Fork 总数", - "field_type": "str" - }, - { - "field_name": "Contributors", - "desc": "开源项目的Github 贡献者 总数", - "field_type": "str" - }, - { - "field_name": "OSC-URL", - "desc": "开源项目的Github CodeChina URL", - "field_type": "str" - }, - { - "field_name": "Region", - "desc": "开源项目所属的区域", - "field_type": "str" - } - ] - }, - "repo_csdn_trends": { - "file": "../data/CSDN/repo-csdn-trends.xlsx", - "sheet_name": "Sheet1", - "desc": "开源项目在CSDN站内指数数据", - "fields": [ - { - "field_name": "repo_name", - "field_type": "str", - "desc": "开源项目名称" - }, - { - "field_name": "2020-10-01 00:00:00", - "field_type": "int", - "desc": "月份第一天时间" - } - ] - }, - "repo_commit_rank": { - "file": "../data/CSDN/repo-commit-rank.csv", - "desc": "开源项目在Github的月commit变化", - "fields": [ - { - "field_name": "actor_email", - "field_type": "str", - "desc": "用户邮箱" - }, - { - "field_name": "sum_total", - "field_type": "int", - "desc": "用户累计Github项目贡献数" - }, - { - "feild_name": "any_repo_path", - "field_type": "str", - "desc": "用户贡献过的任意一个Github仓库路径" - }, - { - "field_name": "any_commit_id", - "field_type": "str", - "desc": "用户在上述贡献过的Github仓库里的任意一个commit" - } - ] - }, - "repo_github_active_trends": { - "file": "../data/PingCAP/项目活跃度变化.csv", - "desc": "开源项目在Github上的月活跃度数据", - "fields": [ - { - "filed_name": "event_month", - "field_type": "date", - "desc": "交互数据汇总月份" - }, - { - "field_name": "repo_name", - "field_type": "str", - "desc": "开源项目名称" - }, - { - "field_name": "push_count", - "field_type": "int", - "desc": "开源项目当月 git push 次数" - }, - { - "field_name": "pr_count", - "field_type": "int", - "desc": "开源项目当月 git pr 次数" - }, - { - "field_name": "issue_count", - 
"field_type": "int", - "desc": "开源项目当月 git issue 个数" - }, - { - "field_name": "creator_count", - "field_type": "int", - "desc": "开源项目当月 git操作 创建者总数" - } - ] - }, - "repo_github_popular_trends": { - "file": "../data/PingCAP/项目受欢迎度变化.csv", - "desc": "开源项目在Github上的月收欢迎程度数据", - "fields": [ - { - "filed_name": "event_month", - "field_type": "date", - "desc": "交互数据汇总月份" - }, - { - "field_name": "repo_name", - "field_type": "str", - "desc": "开源项目名称" - }, - { - "field_name": "push_count", - "field_type": "int", - "desc": "开源项目当月 git push 次数" - }, - { - "field_name": "pr_count", - "field_type": "int", - "desc": "开源项目当月 git pr 次数" - }, - { - "field_name": "issue_count", - "field_type": "int", - "desc": "开源项目当月 git issue 个数" - }, - { - "field_name": "creator_count", - "field_type": "int", - "desc": "开源项目当月 git操作 创建者总数" - } - ] +from common.cli import dispatch +from config import config +from tasks.ranks import calc_ranks +from optparse import OptionParser + + +def parse_options(): + parser = OptionParser() + parser.add_option( + "-a", "--action", + dest="action", + help="action", + metavar="ACTION" + ) + (options, args) = parser.parse_args() + return [options, args] + + +def show_help(): + print("请指定任务类型,用例:") + print("python main.py -a tasks.ranks") + + +def main(config, options, actions): + # 配置任务路由 + dispatch(actions, { + # 请在此添加其他任务路由 + "tasks": { + "ranks": lambda: calc_ranks(config, options) } - }, -} - - -def load_repo_github_user_info(config, ctx): - schema = config["schema"] - df = pd.read_excel( - schema["repo_github_user_info"]["file"], - sheet_name=schema["repo_github_user_info"]["sheet_name"]) - df.fillna(value=0, inplace=True) - ctx["repo_github_user_info"] = df - - -def load_repo_github_info(config, ctx): - schema = config["schema"] - df = pd.read_excel( - schema["repo_github_info"]["file"], - sheet_name=schema["repo_github_info"]["sheet_name"]) - df.fillna(value=0, inplace=True) - ctx["repo_github_info"] = df - - -def load_repo_csdn_trends(config, ctx): - 
schema = config["schema"] - df = pd.read_excel( - schema["repo_csdn_trends"]["file"], - sheet_name=schema["repo_csdn_trends"]["sheet_name"]) - df.fillna(value=0, inplace=True) - ctx["repo_csdn_trends"] = df - - -def load_repo_commit_rank(config, ctx): - schema = config["schema"] - df = pd.read_csv(schema["repo_commit_rank"]["file"]) - df.fillna(value=0, inplace=True) - ctx["repo_commit_rank"] = df - - -def load_repo_github_active_trends(config, ctx): - schema = config["schema"] - df = pd.read_csv(schema["repo_github_active_trends"]["file"]) - df.fillna(value=0, inplace=True) - ctx["repo_github_active_trends"] = df - - -def load_repo_github_popular_trends(config, ctx): - schema = config["schema"] - df = pd.read_csv(schema["repo_github_popular_trends"]["file"]) - df.fillna(value=0, inplace=True) - ctx["repo_github_popular_trends"] = df - - -def rank_repo_top_n(config, ctx): - repo_rank = [] - repo_dict = {} - - # 合并项目的总数据 repo_github_info 主键 FullName - df = ctx["repo_github_info"] - df.fillna(value=0) - for index, row in df.iterrows(): - repo_item = {} - repo_key = row["FullName"].lower() - repo_item["region"] = row["Region"] - repo_item["star"] = int(row["Star"]) - repo_item["fork"] = int(row["Fork"]) - repo_item["contributors"] = int(row["Contributors"]) - - repo_dict[repo_key] = repo_item - repo_rank.append(repo_item) - - # 合并项目的CSDN指数数据 repo_csdn_trends 主键 repo_name - df = ctx["repo_csdn_trends"] - for index, row in df.iterrows(): - repo_name = row['repo_name'] - repo_key = repo_name.lower() - repo_item = repo_dict.get(repo_key) - repo_item["csdn_index_month_avg"] = row[1:].mean() - - # 合并项目的活跃变动数据 repo_github_active_trends 主键 repo_name - df = ctx["repo_github_active_trends"] - df.fillna(value=0) - df = df.groupby(["repo_name"]).agg(np.mean) - # print(df.loc['TheAlgorithms/Python']) - for index, row in df.iterrows(): - repo_name = index - repo_key = repo_name.lower() - repo_item = repo_dict.get(repo_key) - repo_item["push_count_month_avg"] = row["push_count"] - 
repo_item["pr_count_month_avg"] = row["pr_count"] - repo_item["issue_count_month_avg"] = row["issue_count"] - repo_item["creator_count_month_avg"] = row["creator_count"] - - # 合并项目的激活变动数据 repo_github_popular_trends 主键 repo_name - df = ctx["repo_github_popular_trends"] - df = df.groupby(["repo_name"]).agg(np.mean) - for index, row in df.iterrows(): - repo_name = index - repo_key = repo_name.lower() - repo_item = repo_dict.get(repo_key) - repo_item["watch_count_month_avg"] = row["watch_count"] - repo_item["fork_count_month_avg"] = row["fork_count"] - - # 合并表 - df = pd.DataFrame.from_dict(repo_dict, orient='index') - df.fillna(value=0, inplace=True) - df.reset_index() - - # 计算榜单得分 - weights = { - "star": 1, - "fork": 1, - "contributors": 1, - "csdn_index_month_avg": 1, - "push_count_month_avg": 1, - "pr_count_month_avg": 1, - "issue_count_month_avg": 1, - "creator_count_month_avg": 1, - "watch_count_month_avg": 1, - "fork_count_month_avg": 1 - } - total_weight_value = 0 - for key in weights: - total_weight_value += weights[key] - - for key in weights: - weights[key] = weights[key]/total_weight_value - - df["score"] = 0 - for key in weights: - df["score"] += df[key].apply(lambda x: x*weights[key]) - - print(df.head()) - df = df.sort_values(by="score", ascending=False) - df.to_csv(config["ranks"]["repo"]["top_n"]) - - -def rank_personal_top_n(config, ctx): - pass - - -def rank_company_top_n(config, ctx): - pass - - -def main(config): - ctx = {} - - # print("@load_repo_github_user_info..") - # load_repo_github_user_info(config, ctx) - - # print("@load_repo_commit_rank..") - # load_repo_commit_rank(config, ctx) - - print("@load_repo_github_info..") - load_repo_github_info(config, ctx) - - print("@load_repo_csdn_trends..") - load_repo_csdn_trends(config, ctx) - - print("@load_repo_github_active_trends..") - load_repo_github_active_trends(config, ctx) - - print("@load_repo_github_popular_trends..") - load_repo_github_popular_trends(config, ctx) - - 
print("@rank_repo_top_n..") - rank_repo_top_n(config, ctx) - - # print("@rank_personal_top_n..") - # rank_personal_top_n(config, ctx) - - # print("@rank_company_top_n..") - # rank_company_top_n(config, ctx) + }) if __name__ == "__main__": - main(config) + [options, args] = parse_options() + if options.action is None: + show_help() + else: + actions = options.action.split('.') + if len(actions) == 0: + show_help() + else: + main(config, options, actions) diff --git a/src/github_author_claw.py b/src/tasks/github_author_claw.py similarity index 100% rename from src/github_author_claw.py rename to src/tasks/github_author_claw.py diff --git a/src/tasks/ranks.py b/src/tasks/ranks.py new file mode 100644 index 0000000000000000000000000000000000000000..6c1e818f8a900c08947ee217cdd05a9b77896b92 --- /dev/null +++ b/src/tasks/ranks.py @@ -0,0 +1,176 @@ +# -*- coding: utf-8 -*- +import os +import numpy as np +import pandas as pd +from config import config + + +def load_repo_github_user_info(config, ctx): + schema = config["schema"] + df = pd.read_excel( + schema["repo_github_user_info"]["file"], + sheet_name=schema["repo_github_user_info"]["sheet_name"]) + df.fillna(value=0, inplace=True) + ctx["repo_github_user_info"] = df + + +def load_repo_github_info(config, ctx): + schema = config["schema"] + df = pd.read_excel( + schema["repo_github_info"]["file"], + sheet_name=schema["repo_github_info"]["sheet_name"]) + df.fillna(value=0, inplace=True) + ctx["repo_github_info"] = df + + +def load_repo_csdn_trends(config, ctx): + schema = config["schema"] + df = pd.read_excel( + schema["repo_csdn_trends"]["file"], + sheet_name=schema["repo_csdn_trends"]["sheet_name"]) + df.fillna(value=0, inplace=True) + ctx["repo_csdn_trends"] = df + + +def load_repo_commit_rank(config, ctx): + schema = config["schema"] + df = pd.read_csv(schema["repo_commit_rank"]["file"]) + df.fillna(value=0, inplace=True) + ctx["repo_commit_rank"] = df + + +def load_repo_github_active_trends(config, ctx): + schema = 
config["schema"] + df = pd.read_csv(schema["repo_github_active_trends"]["file"]) + df.fillna(value=0, inplace=True) + ctx["repo_github_active_trends"] = df + + +def load_repo_github_popular_trends(config, ctx): + schema = config["schema"] + df = pd.read_csv(schema["repo_github_popular_trends"]["file"]) + df.fillna(value=0, inplace=True) + ctx["repo_github_popular_trends"] = df + + +def rank_repo_top_n(config, ctx): + repo_rank = [] + repo_dict = {} + + # 合并项目的总数据 repo_github_info 主键 FullName + df = ctx["repo_github_info"] + df.fillna(value=0) + for index, row in df.iterrows(): + repo_item = {} + repo_key = row["FullName"].lower() + repo_item["region"] = row["Region"] + repo_item["star"] = int(row["Star"]) + repo_item["fork"] = int(row["Fork"]) + repo_item["contributors"] = int(row["Contributors"]) + + repo_dict[repo_key] = repo_item + repo_rank.append(repo_item) + + # 合并项目的CSDN指数数据 repo_csdn_trends 主键 repo_name + df = ctx["repo_csdn_trends"] + for index, row in df.iterrows(): + repo_name = row['repo_name'] + repo_key = repo_name.lower() + repo_item = repo_dict.get(repo_key) + repo_item["csdn_index_month_avg"] = row[1:].mean() + + # 合并项目的活跃变动数据 repo_github_active_trends 主键 repo_name + df = ctx["repo_github_active_trends"] + df.fillna(value=0) + df = df.groupby(["repo_name"]).agg(np.mean) + # print(df.loc['TheAlgorithms/Python']) + for index, row in df.iterrows(): + repo_name = index + repo_key = repo_name.lower() + repo_item = repo_dict.get(repo_key) + repo_item["push_count_month_avg"] = row["push_count"] + repo_item["pr_count_month_avg"] = row["pr_count"] + repo_item["issue_count_month_avg"] = row["issue_count"] + repo_item["creator_count_month_avg"] = row["creator_count"] + + # 合并项目的激活变动数据 repo_github_popular_trends 主键 repo_name + df = ctx["repo_github_popular_trends"] + df = df.groupby(["repo_name"]).agg(np.mean) + for index, row in df.iterrows(): + repo_name = index + repo_key = repo_name.lower() + repo_item = repo_dict.get(repo_key) + 
repo_item["watch_count_month_avg"] = row["watch_count"] + repo_item["fork_count_month_avg"] = row["fork_count"] + + # 合并表 + df = pd.DataFrame.from_dict(repo_dict, orient='index') + df.fillna(value=0, inplace=True) + df.reset_index() + + # 计算榜单得分 + weights = { + "star": 1, + "fork": 1, + "contributors": 1, + "csdn_index_month_avg": 1, + "push_count_month_avg": 1, + "pr_count_month_avg": 1, + "issue_count_month_avg": 1, + "creator_count_month_avg": 1, + "watch_count_month_avg": 1, + "fork_count_month_avg": 1 + } + total_weight_value = 0 + for key in weights: + total_weight_value += weights[key] + + for key in weights: + weights[key] = weights[key]/total_weight_value + + df["score"] = 0 + for key in weights: + df["score"] += df[key].apply(lambda x: x*weights[key]) + + print(df.head()) + df = df.sort_values(by="score", ascending=False) + df.to_csv(config["ranks"]["repo"]["top_n"]) + + +def rank_personal_top_n(config, ctx): + pass + + +def rank_company_top_n(config, ctx): + pass + + +def calc_ranks(config, options): + ctx = {} + + # print("@load_repo_github_user_info..") + # load_repo_github_user_info(config, ctx) + + # print("@load_repo_commit_rank..") + # load_repo_commit_rank(config, ctx) + + print("@load_repo_github_info..") + load_repo_github_info(config, ctx) + + print("@load_repo_csdn_trends..") + load_repo_csdn_trends(config, ctx) + + print("@load_repo_github_active_trends..") + load_repo_github_active_trends(config, ctx) + + print("@load_repo_github_popular_trends..") + load_repo_github_popular_trends(config, ctx) + + print("@rank_repo_top_n..") + rank_repo_top_n(config, ctx) + + # print("@rank_personal_top_n..") + # rank_personal_top_n(config, ctx) + + # print("@rank_company_top_n..") + # rank_company_top_n(config, ctx) diff --git a/src/repoinfo.py b/src/tasks/repoinfo.py similarity index 100% rename from src/repoinfo.py rename to src/tasks/repoinfo.py