# -*- coding: utf-8 -*-
"""Build repository, contributor and company rankings.

Each ``load_*`` function reads one table named in ``config["schema"]`` into
``ctx``; each ``rank_*`` function derives a ranking from tables already in
``ctx``, writes it to the CSV path given in ``config["ranks"]`` and stores the
result under ``ctx["ranks"]``.  ``calc_ranks`` runs the whole pipeline.
"""
import numpy as np
import pandas as pd

# Relative weight of every metric in the repository score.  The weights are
# normalised to sum to 1 before use, so only their ratios matter.
_REPO_SCORE_WEIGHTS = {
    "star": 1,
    "fork": 1,
    "contributors": 1,
    "csdn_index_month_avg": 1,
    "push_count_month_avg": 1,
    "pr_count_month_avg": 1,
    "issue_count_month_avg": 1,
    "creator_count_month_avg": 1,
    "watch_count_month_avg": 1,
    "fork_count_month_avg": 1,
}

# Trend-table column -> key written into the per-repository record.
_ACTIVE_TREND_FIELDS = {
    "push_count": "push_count_month_avg",
    "pr_count": "pr_count_month_avg",
    "issue_count": "issue_count_month_avg",
    "creator_count": "creator_count_month_avg",
}
_POPULAR_TREND_FIELDS = {
    "watch_count": "watch_count_month_avg",
    "fork_count": "fork_count_month_avg",
}

# Profile fields copied from the user-info table into every ranking row.
_PERSON_FIELDS = (
    "actor_email",
    "company_by_email",
    "avatar_url",
    "name",
    "company",
    "location",
    "followers",
    "author_id",
    "type",
    "login",
    "country",
)


def _load_excel_table(config, ctx, table):
    """Read the Excel sheet configured for *table* into ``ctx[table]``."""
    entry = config["schema"][table]
    df = pd.read_excel(entry["file"], sheet_name=entry["sheet_name"])
    ctx[table] = df.fillna(value=0)


def _load_csv_table(config, ctx, table):
    """Read the CSV file configured for *table* into ``ctx[table]``."""
    df = pd.read_csv(config["schema"][table]["file"])
    ctx[table] = df.fillna(value=0)


def load_repo_github_info(config, ctx):
    """Load the ``repo_github_info`` Excel sheet; missing cells become 0."""
    _load_excel_table(config, ctx, "repo_github_info")


def load_repo_csdn_trends(config, ctx):
    """Load the ``repo_csdn_trends`` Excel sheet; missing cells become 0."""
    _load_excel_table(config, ctx, "repo_csdn_trends")


def load_repo_commit_rank(config, ctx):
    """Load the ``repo_commit_rank`` CSV file; missing cells become 0."""
    _load_csv_table(config, ctx, "repo_commit_rank")


def load_repo_github_active_trends(config, ctx):
    """Load the ``repo_github_active_trends`` CSV; missing cells become 0."""
    _load_csv_table(config, ctx, "repo_github_active_trends")


def load_repo_github_popular_trends(config, ctx):
    """Load the ``repo_github_popular_trends`` CSV; missing cells become 0."""
    _load_csv_table(config, ctx, "repo_github_popular_trends")


def load_repo_github_user_info(config, ctx):
    """Load the ``repo_github_user_info`` Excel sheet; missing cells become 0."""
    _load_excel_table(config, ctx, "repo_github_user_info")


def load_repo_github_user_commit_info(config, ctx):
    """Load the ``repo_github_user_commit_info`` CSV; missing cells become 0."""
    _load_csv_table(config, ctx, "repo_github_user_commit_info")


def load_district_zh_cn(config, ctx):
    """Load the header-less, tab-separated district table.

    Column names come from ``config["schema"]["district_zh_cn"]["fields"]``
    (each entry's ``field_name``).  The frame is stored in
    ``ctx["district_zh_cn"]`` with missing cells replaced by 0.
    """
    entry = config["schema"]["district_zh_cn"]
    names = [field["field_name"] for field in entry["fields"]]
    print(names)
    df = pd.read_csv(
        entry["file"],
        header=None,
        names=names,
        index_col=False,
        sep='\t')
    df = df.fillna(value=0)
    ctx["district_zh_cn"] = df
    print(df.head())


def _lookup_repo(repo_dict, repo_name, source):
    """Return the record for *repo_name*, or ``None`` (with a warning).

    Trend tables may mention repositories that are absent from the info
    table, and ``fillna(0)`` turns an empty name cell into the integer 0;
    both cases are skipped instead of crashing the whole ranking.
    """
    if not isinstance(repo_name, str):
        print(f"[warn] {source}: invalid repo name {repo_name!r}")
        return None
    repo_item = repo_dict.get(repo_name.lower())
    if repo_item is None:
        print(f"[warn] {source}: unknown repo:{repo_name}")
    return repo_item


def _merge_trend_means(repo_dict, trends, field_map, source):
    """Store the per-repository mean of each trend column in *repo_dict*."""
    if trends.empty:
        return
    # Average only the columns that are consumed: other columns (for example
    # a textual month label) cannot be averaged and are irrelevant here.
    means = trends.groupby("repo_name")[list(field_map)].mean()
    for repo_name, row in means.iterrows():
        repo_item = _lookup_repo(repo_dict, repo_name, source)
        if repo_item is None:
            continue
        for column, target in field_map.items():
            repo_item[target] = row[column]


def rank_repo_top_n(config, ctx):
    """Score every repository and write the repository ranking.

    Reads ``repo_github_info``, ``repo_csdn_trends``,
    ``repo_github_active_trends`` and ``repo_github_popular_trends`` from
    *ctx*.  Repositories are keyed by lower-cased ``FullName``.  The score is
    the weighted sum of the metrics in ``_REPO_SCORE_WEIGHTS``; metrics a
    repository has no data for count as 0.

    Writes the ranking to ``config["ranks"]["repo"]["top_n"]`` and stores
    ``ctx["ranks"]["repo_top_n"]`` (DataFrame sorted by score, descending)
    and ``ctx["ranks"]["repo_top_n_dict"]`` (record per repository,
    including ``score``).
    """
    repo_dict = {}

    # Base record per repository, keyed by lower-cased FullName.
    for _, row in ctx["repo_github_info"].iterrows():
        full_name = row["FullName"]
        if not isinstance(full_name, str):
            print(f"[warn] repo_github_info: invalid repo name {full_name!r}")
            continue
        repo_dict[full_name.lower()] = {
            "region": row["Region"],
            "star": int(row["Star"]),
            "fork": int(row["Fork"]),
            "contributors": int(row["Contributors"]),
        }

    # CSDN index: mean over every column after the first one.
    # NOTE(review): this assumes repo_name is the first column - confirm.
    for _, row in ctx["repo_csdn_trends"].iterrows():
        repo_item = _lookup_repo(
            repo_dict, row["repo_name"], "repo_csdn_trends")
        if repo_item is None:
            continue
        repo_item["csdn_index_month_avg"] = row.iloc[1:].mean()

    # Activity and popularity trends: per-repository column means.
    _merge_trend_means(
        repo_dict, ctx["repo_github_active_trends"],
        _ACTIVE_TREND_FIELDS, "repo_github_active_trends")
    _merge_trend_means(
        repo_dict, ctx["repo_github_popular_trends"],
        _POPULAR_TREND_FIELDS, "repo_github_popular_trends")

    # One row per repository; metrics without data become 0.
    df = pd.DataFrame.from_dict(repo_dict, orient='index')
    df = df.fillna(value=0)

    # Weighted score with weights normalised to sum to 1.
    total_weight = sum(_REPO_SCORE_WEIGHTS.values())
    score = pd.Series(0.0, index=df.index)
    for key, weight in _REPO_SCORE_WEIGHTS.items():
        if key not in df.columns:
            # No repository had this metric at all.
            df[key] = 0
        score = score + df[key] * (weight / total_weight)
    df["score"] = score
    print(df.head())

    df.sort_values(by="score", ascending=False, inplace=True)
    df.to_csv(config["ranks"]["repo"]["top_n"])

    # Copy the score back so later stages can read it from the dict.
    for repo_key, repo_score in df["score"].items():
        repo_dict[repo_key]["score"] = repo_score

    ctx["ranks"]["repo_top_n"] = df
    ctx["ranks"]["repo_top_n_dict"] = repo_dict


def _build_district_lookup(district_df):
    """Return the lower-cased district pinyin names, plus ``"china"``."""
    names = {"china"}
    for pinyin in district_df["pinyin"]:
        # fillna(0) turns empty cells into the integer 0; skip those.
        if isinstance(pinyin, str) and pinyin != "":
            names.add(pinyin.lower())
    return names


def _company_from_email(email):
    """Derive a company key from the last two labels of the email domain."""
    email_parts = email.split("@")
    if len(email_parts) > 1 and email_parts[1] != "":
        domain_labels = email_parts[-1].replace("..", ".").split(".")
        return ".".join(domain_labels[-2:])
    return "未知公司,邮箱错误"


def _collect_person_profiles(user_info_df, district_names):
    """Build a profile dict per contributor, keyed by lower-cased email."""
    profiles = {}
    for _, row in user_info_df.iterrows():
        email = row["actor_email"]
        # Ignore malformed rows (missing emails were replaced by 0).
        if not isinstance(email, str):
            continue

        person = {
            "actor_email": email,
            "company_by_email": _company_from_email(email),
            "avatar_url": row["avatar_url"],
            "name": row["name"],
            "company": row["company"],
            "location": row["location"],
            "followers": row["followers"],
            "author_id": row["author_id"],
            "type": row["type"],
            "login": row["login"],
        }

        # A contributor counts as "china" when any comma-separated part of
        # the location exactly matches a known district name.
        location = person["location"]
        in_china = isinstance(location, str) and any(
            part.strip().lower() in district_names
            for part in location.split(","))
        person["country"] = "china" if in_china else "international"

        person["all_repo_contribute_total"] = int(row["sum_total"])
        profiles[email.lower()] = person
    return profiles


def _attach_contributions(repo_top_n_dict, commit_info_df):
    """Append each commit row to its repository's ``user_contribute_list``.

    Rows for repositories missing from *repo_top_n_dict* are dropped.  Every
    list is then sorted by ``total``, descending; repositories that received
    no rows are reported with a warning.
    """
    for _, row in commit_info_df.iterrows():
        email = row["actor_email"]
        repo_name = row["repo_name"]
        # Ignore malformed rows.
        if not isinstance(email, str) or not isinstance(repo_name, str):
            continue
        repo_item = repo_top_n_dict.get(repo_name.lower())
        if repo_item is None:
            continue
        repo_item.setdefault("user_contribute_list", []).append({
            "actor_email": email,
            "total": row["total"],
            "repo_score": repo_item["score"],
        })

    for repo_key, repo_item in repo_top_n_dict.items():
        contributions = repo_item.get("user_contribute_list")
        if contributions is not None:
            contributions.sort(key=lambda x: x["total"], reverse=True)
        else:
            print(f"[warn] missing contributors repo:{repo_key}")


def _add_person_score(score_dict, person_key, person_info, points):
    """Add *points* to the person's entry, creating it from *person_info*."""
    entry = score_dict.get(person_key)
    if entry is None:
        entry = {"score": points}
        for field in _PERSON_FIELDS:
            entry[field] = person_info[field]
        score_dict[person_key] = entry
    else:
        entry["score"] += points


def _export_personal_rank(score_dict, csv_path):
    """Turn *score_dict* into a score-sorted frame and write it to CSV."""
    df = pd.DataFrame.from_dict(score_dict, orient='index')
    if df.empty:
        # Keep the expected columns so sorting and the later company
        # aggregation still work when nobody qualified for this ranking.
        columns = {"score": pd.Series(dtype="float64")}
        for field in _PERSON_FIELDS:
            columns[field] = pd.Series(dtype="object")
        df = pd.DataFrame(columns)
    df.sort_values(by="score", ascending=False, inplace=True)
    df.to_csv(csv_path)
    print(df.head())
    return df


def rank_personal_top_n(config, ctx):
    """Score contributors overall and per repository region.

    A contribution is worth ``total * repo_score``.  Every contribution adds
    to the overall ranking and to exactly one regional ranking: ``zh_cn``
    when the repository's region is ``"国产"``, ``en`` otherwise.
    Contributions whose email has no row in ``repo_github_user_info`` are
    ignored in all three rankings.

    Reads ``ctx["ranks"]["repo_top_n_dict"]``, ``district_zh_cn``,
    ``repo_github_user_info`` and ``repo_github_user_commit_info``.  Writes
    the CSV files named in ``config["ranks"]["personal"]`` and stores
    ``personal_top_n``, ``personal_top_n_en`` and ``personal_top_n_zh_cn``
    (each with a matching ``*_dict``) under ``ctx["ranks"]``.
    """
    repo_top_n_dict = ctx["ranks"]["repo_top_n_dict"]
    district_names = _build_district_lookup(ctx["district_zh_cn"])
    personal_dict = _collect_person_profiles(
        ctx["repo_github_user_info"], district_names)
    _attach_contributions(
        repo_top_n_dict, ctx["repo_github_user_commit_info"])

    personal_score_dict = {}
    personal_score_dict_en = {}
    personal_score_dict_zh_cn = {}

    for repo_item in repo_top_n_dict.values():
        contributions = repo_item.get("user_contribute_list")
        if contributions is None:
            continue
        if repo_item["region"] == "国产":
            regional_scores = personal_score_dict_zh_cn
        else:
            regional_scores = personal_score_dict_en

        for contribution in contributions:
            person_key = contribution["actor_email"].lower()
            person_info = personal_dict.get(person_key)
            # Resolve the profile before touching any tally, so unmatched
            # contributors are skipped consistently in all rankings.
            if person_info is None:
                continue
            points = contribution["total"] * contribution["repo_score"]
            _add_person_score(
                personal_score_dict, person_key, person_info, points)
            _add_person_score(
                regional_scores, person_key, person_info, points)

    personal_cfg = config["ranks"]["personal"]
    ranks = ctx["ranks"]

    # Contributions to all projects.
    ranks["personal_top_n"] = _export_personal_rank(
        personal_score_dict, personal_cfg["top_n"])
    ranks["personal_top_n_dict"] = personal_score_dict

    # Contributions to international projects.
    ranks["personal_top_n_en"] = _export_personal_rank(
        personal_score_dict_en, personal_cfg["top_n_en"])
    ranks["personal_top_n_en_dict"] = personal_score_dict_en

    # Contributions to Chinese projects.
    ranks["personal_top_n_zh_cn"] = _export_personal_rank(
        personal_score_dict_zh_cn, personal_cfg["top_n_zh_cn"])
    ranks["personal_top_n_zh_cn_dict"] = personal_score_dict_zh_cn


def _rank_companies(personal_df, csv_path, flatten_index):
    """Sum the numeric columns per ``company_by_email`` and write the CSV.

    With *flatten_index* the company becomes a regular column; otherwise it
    stays as the frame's index.
    """
    # numeric_only: the text columns may mix strings with the 0 fill value
    # and cannot be summed.
    company_df = personal_df.groupby(
        ["company_by_email"]).sum(numeric_only=True)
    if flatten_index:
        company_df = company_df.reset_index()
    company_df.sort_values(by="score", ascending=False, inplace=True)
    company_df.to_csv(csv_path)
    print(company_df.head())
    return company_df


def rank_company_top_n(config, ctx):
    """Aggregate the three personal rankings into company rankings.

    Reads ``personal_top_n``, ``personal_top_n_en`` and
    ``personal_top_n_zh_cn`` from ``ctx["ranks"]``, writes the CSV files
    named in ``config["ranks"]["company"]`` and stores ``company_top_n``,
    ``company_top_n_en`` and ``company_top_n_zh_cn`` under ``ctx["ranks"]``.
    The overall table is indexed by company; the two regional tables carry
    the company as a column (existing output layout, kept as is).
    """
    company_cfg = config["ranks"]["company"]
    ranks = ctx["ranks"]

    # Companies across all projects.
    ranks["company_top_n"] = _rank_companies(
        ranks["personal_top_n"], company_cfg["top_n"], False)

    # Companies across international projects.
    ranks["company_top_n_en"] = _rank_companies(
        ranks["personal_top_n_en"], company_cfg["top_n_en"], True)

    # Companies across Chinese projects.
    ranks["company_top_n_zh_cn"] = _rank_companies(
        ranks["personal_top_n_zh_cn"], company_cfg["top_n_zh_cn"], True)


def calc_repo_rank(config, options, ctx):
    """Load the repository tables and compute the repository ranking.

    *options* is accepted for interface symmetry and is not used here.
    """
    print("@load_repo_github_info..")
    load_repo_github_info(config, ctx)

    print("@load_repo_csdn_trends..")
    load_repo_csdn_trends(config, ctx)

    print("@load_repo_github_active_trends..")
    load_repo_github_active_trends(config, ctx)

    print("@load_repo_github_popular_trends..")
    load_repo_github_popular_trends(config, ctx)

    print("@rank_repo_top_n..")
    rank_repo_top_n(config, ctx)


def calc_personal_rank(config, options, ctx):
    """Load the contributor tables and compute the personal rankings.

    Requires ``ctx["ranks"]["repo_top_n_dict"]`` from ``calc_repo_rank``.
    *options* is not used here.
    """
    print("@load_district_zh_cn..")
    load_district_zh_cn(config, ctx)

    print("@load_repo_github_user_info..")
    load_repo_github_user_info(config, ctx)

    print("@load_repo_github_user_commit_info..")
    load_repo_github_user_commit_info(config, ctx)

    print("@rank_personal_top_n..")
    rank_personal_top_n(config, ctx)


def calc_company_rank(config, options, ctx):
    """Compute the company rankings from the personal rankings in *ctx*."""
    rank_company_top_n(config, ctx)


def calc_ranks(config, options):
    """Run the repository, personal and company rankings in order."""
    ctx = {
        "ranks": {}
    }
    calc_repo_rank(config, options, ctx)
    calc_personal_rank(config, options, ctx)
    calc_company_rank(config, options, ctx)