“745aacfc38cbe5a63f768a4bea0f949bb07769b3”上不存在“python/git@gitcode.net:s920243400/PaddleDetection.git”
提交 00e4610a 编写于 作者: F feilong

add company rank

上级 89cef470
此差异已折叠。
此差异已折叠。
...@@ -26,6 +26,67 @@ config = { ...@@ -26,6 +26,67 @@ config = {
} }
}, },
"schema": { "schema": {
"district_zh_cn": {
"file": "../data/district/district-full.csv",
"desc": "中国省市汉语拼音表",
"fields": [
{
"field_name": "id",
"field_type": "int",
"desc": "编号"
},
{
"field_name": "name",
"field_type": "str",
"desc": "名字"
},
{
"field_name": "parent_id",
"field_type": "int",
"desc": "父级id"
},
{
"field_name": "initial",
"field_type": "char",
"desc": "拼音首字符"
},
{
"field_name": "initials",
"field_type": "str",
"desc": "拼音前缀"
},
{
"field_name": "pinyin",
"field_type": "str",
"desc": "拼音"
},
{
"field_name": "extra",
"field_type": "str",
"desc": "额外信息"
},
{
"field_name": "suffix",
"field_type": "str",
"desc": "行政区划"
},
{
"field_name": "code",
"field_type": "str",
"desc": "邮政编码"
},
{
"field_name": "area_code",
"field_type": "str",
"desc": "区域码"
},
{
"field_name": "order",
"field_type": "int",
"desc": "排序"
},
]
},
"repo_github_user_info": { "repo_github_user_info": {
"file": "../data/GitHub/Userinfo.xlsx", "file": "../data/GitHub/Userinfo.xlsx",
"sheet_name": "repo", "sheet_name": "repo",
......
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from audioop import reverse from turtle import pos
import os
import numpy as np import numpy as np
import pandas as pd import pandas as pd
...@@ -71,6 +70,18 @@ def load_repo_github_user_commit_info(config, ctx): ...@@ -71,6 +70,18 @@ def load_repo_github_user_commit_info(config, ctx):
ctx["repo_github_user_commit_info"] = df ctx["repo_github_user_commit_info"] = df
def load_district_zh_cn(config, ctx):
    """Load the district table into ``ctx["district_zh_cn"]``.

    The file named by ``config["schema"]["district_zh_cn"]["file"]`` is read
    as a tab-separated table without a header row; column names come from the
    ``field_name`` entries of the schema's ``fields`` list.

    Missing cells are filled according to the declared ``field_type``:
    ``0`` for ``"int"`` columns and ``""`` for every other type.  Filling text
    columns with ``0`` would leave integers in columns such as ``pinyin``,
    which breaks callers that compare against ``""`` and then call string
    methods on the value.

    Args:
        config: Configuration dict holding the ``schema`` section.
        ctx: Shared context dict; receives the loaded DataFrame.
    """
    table = config["schema"]["district_zh_cn"]
    fields = table["fields"]
    names = [field["field_name"] for field in fields]
    print(names)

    df = pd.read_csv(
        table["file"], header=None, names=names, index_col=False, sep="\t")

    # One fill value per column, chosen from the schema's declared type.
    fill_values = {
        field["field_name"]: 0 if field.get("field_type") == "int" else ""
        for field in fields
    }
    # Non-inplace fillna so a column that parsed as all-NaN floats can be
    # upcast when it receives "" as its fill value.
    df = df.fillna(value=fill_values)

    ctx["district_zh_cn"] = df
    print(df.head())
def rank_repo_top_n(config, ctx): def rank_repo_top_n(config, ctx):
repo_rank = [] repo_rank = []
repo_dict = {} repo_dict = {}
...@@ -166,6 +177,15 @@ def rank_repo_top_n(config, ctx): ...@@ -166,6 +177,15 @@ def rank_repo_top_n(config, ctx):
def rank_personal_top_n(config, ctx): def rank_personal_top_n(config, ctx):
repo_top_n_dict = ctx["ranks"]["repo_top_n_dict"] repo_top_n_dict = ctx["ranks"]["repo_top_n_dict"]
district_zh_cn = ctx["district_zh_cn"]
district_pinyin_dict = {
"china": True,
}
for index, row in district_zh_cn.iterrows():
pinyin = row["pinyin"]
if pinyin is not None and pinyin != "":
district_pinyin_dict[pinyin.lower()] = True
# 合并 repo_github_user_info 表,主键是 actor_email # 合并 repo_github_user_info 表,主键是 actor_email
personal_dict = {} personal_dict = {}
df = ctx["repo_github_user_info"] df = ctx["repo_github_user_info"]
...@@ -182,6 +202,17 @@ def rank_personal_top_n(config, ctx): ...@@ -182,6 +202,17 @@ def rank_personal_top_n(config, ctx):
personal_dict[person_key] = person personal_dict[person_key] = person
# 用户基本信息 # 用户基本信息
person['actor_email'] = row['actor_email']
email_parts = person['actor_email'].split("@")
if len(email_parts) > 1 and email_parts[1] != "":
post_fix = email_parts[len(email_parts)-1]
post_parts = post_fix.replace("..", ".").split(".")
company_by_email = ".".join(post_parts[-2:])
person['company_by_email'] = company_by_email
else:
person['company_by_email'] = "未知公司,邮箱错误"
person['avatar_url'] = row['avatar_url'] person['avatar_url'] = row['avatar_url']
person['name'] = row['name'] person['name'] = row['name']
person['company'] = row['company'] person['company'] = row['company']
...@@ -191,7 +222,19 @@ def rank_personal_top_n(config, ctx): ...@@ -191,7 +222,19 @@ def rank_personal_top_n(config, ctx):
person['type'] = row['type'] person['type'] = row['type']
person['login'] = row['login'] person['login'] = row['login']
# 用户统计数据 location = person['location']
if location is not None and type(location) == type(""):
# results = process.extract(
# location, district_pinyin_list, limit=1, scorer=fuzz.token_sort_ratio)
# print(results, location)
parts = location.split(",")
for part in parts:
if district_pinyin_dict.get(part.strip().lower()) is not None:
person["country"] = "china"
if person.get("country") is None:
person["country"] = "international"
# 用户统计数据
person['all_repo_contribute_total'] = int(row['sum_total']) person['all_repo_contribute_total'] = int(row['sum_total'])
# 合并 repo_github_user_info 仓库排行表记录 # 合并 repo_github_user_info 仓库排行表记录
...@@ -234,14 +277,35 @@ def rank_personal_top_n(config, ctx): ...@@ -234,14 +277,35 @@ def rank_personal_top_n(config, ctx):
# 现在,合并每个项目的用户贡献到一个总表 # 现在,合并每个项目的用户贡献到一个总表
personal_score_dict = {} personal_score_dict = {}
personal_score_dict_en = {}
personal_score_dict_zh_cn = {}
def copy_person_item(person_item, person_info):
    """Copy the profile fields of ``person_info`` into ``person_item``.

    ``person_item`` is updated in place; keys it already holds that are not
    listed below (for example ``score``) are left untouched.  A ``KeyError``
    is raised if ``person_info`` lacks any of the listed keys.
    """
    copied_keys = (
        'actor_email',
        'company_by_email',
        'avatar_url',
        'name',
        'company',
        'location',
        'followers',
        'author_id',
        'type',
        'login',
        'country',
    )
    for key in copied_keys:
        person_item[key] = person_info[key]
for repo_key in repo_top_n_dict: for repo_key in repo_top_n_dict:
repo_item = repo_top_n_dict[repo_key] repo_item = repo_top_n_dict[repo_key]
repo_user_contribute_list = repo_item.get('user_contribute_list') repo_user_contribute_list = repo_item.get('user_contribute_list')
if repo_user_contribute_list is None: if repo_user_contribute_list is None:
continue continue
repo_region = repo_item['region']
for repo_person in repo_user_contribute_list: for repo_person in repo_user_contribute_list:
email = repo_person['actor_email'] email = repo_person['actor_email']
person_key = email.lower() person_key = email.lower()
# all
person_item = personal_score_dict.get(person_key) person_item = personal_score_dict.get(person_key)
if person_item is None: if person_item is None:
person_item = { person_item = {
...@@ -255,27 +319,102 @@ def rank_personal_top_n(config, ctx): ...@@ -255,27 +319,102 @@ def rank_personal_top_n(config, ctx):
if person_info is None: if person_info is None:
continue continue
person_item['avatar_url'] = person_info['avatar_url'] copy_person_item(person_item, person_info)
person_item['name'] = person_info['name']
person_item['company'] = person_info['company']
person_item['location'] = person_info['location']
person_item['followers'] = person_info['followers']
person_item['author_id'] = person_info['author_id']
person_item['type'] = person_info['type']
person_item['login'] = person_info['login']
else: else:
person_item['score'] += repo_person["total"] * \ person_item['score'] += repo_person["total"] * \
repo_person["repo_score"] repo_person["repo_score"]
# 排行 # en
if repo_region != "国产":
person_item = personal_score_dict_en.get(person_key)
if person_item is None:
person_item = {
"score": repo_person["total"]*repo_person["repo_score"],
}
personal_score_dict_en[person_key] = person_item
person_info = personal_dict.get(person_key)
# 忽略不匹配数据
if person_info is None:
continue
copy_person_item(person_item, person_info)
else:
person_item['score'] += repo_person["total"] * \
repo_person["repo_score"]
else:
# zh_cn
person_item = personal_score_dict_zh_cn.get(person_key)
if person_item is None:
person_item = {
"score": repo_person["total"]*repo_person["repo_score"],
}
personal_score_dict_zh_cn[person_key] = person_item
person_info = personal_dict.get(person_key)
# 忽略不匹配数据
if person_info is None:
continue
copy_person_item(person_item, person_info)
else:
person_item['score'] += repo_person["total"] * \
repo_person["repo_score"]
# 个人向全部技术项目贡献排行榜
df = pd.DataFrame.from_dict(personal_score_dict, orient='index') df = pd.DataFrame.from_dict(personal_score_dict, orient='index')
df.sort_values(by="score", ascending=False) df.sort_values(by="score", ascending=False)
df.to_csv(config["ranks"]["personal"]["top_n"]) df.to_csv(config["ranks"]["personal"]["top_n"])
ctx["ranks"]["personal_top_n"] = df
ctx["ranks"]["personal_top_n_dict"] = personal_score_dict
print(df.head())
# 个人向国际技术项目贡献排行榜
df = pd.DataFrame.from_dict(personal_score_dict_en, orient='index')
df.sort_values(by="score", ascending=False)
df.to_csv(config["ranks"]["personal"]["top_n_en"])
ctx["ranks"]["personal_top_n_en"] = df
ctx["ranks"]["personal_top_n_en_dict"] = personal_score_dict_en
print(df.head())
# 个人向中国技术项目贡献排行榜
df = pd.DataFrame.from_dict(personal_score_dict_zh_cn, orient='index')
df.sort_values(by="score", ascending=False)
df.to_csv(config["ranks"]["personal"]["top_n_zh_cn"])
ctx["ranks"]["personal_top_n_zh_cn"] = df
ctx["ranks"]["personal_top_n_zh_cn_dict"] = personal_score_dict_zh_cn
print(df.head()) print(df.head())
def _rank_companies(personal_top_n, csv_target):
    """Aggregate one personal ranking into a company ranking and save it as CSV."""
    company_top_n = (
        personal_top_n.groupby(["company_by_email"])
        # Sum numeric columns only; text columns (names, URLs, ...) cannot be
        # summed meaningfully and would be concatenated or raise.
        .sum(numeric_only=True)
        # sort_values returns a new frame, so its result must be kept.
        # mergesort is stable, which keeps tied companies in a fixed order.
        .sort_values(by="score", ascending=False, kind="mergesort")
        # Turn the company key back into a column; the fresh 0..n-1 index
        # then follows the rank order.
        .reset_index()
    )
    company_top_n.to_csv(csv_target)
    return company_top_n


def rank_company_top_n(config, ctx):
    """Build the company rankings from the personal rankings.

    For each of the three personal rankings stored in ``ctx["ranks"]``
    (``personal_top_n``, ``personal_top_n_en``, ``personal_top_n_zh_cn``) the
    rows are grouped by ``company_by_email``, their numeric columns summed,
    and the result sorted by ``score`` in descending order.  Each company
    ranking is written to the matching path in ``config["ranks"]["company"]``
    (``top_n``, ``top_n_en``, ``top_n_zh_cn``) and stored in ``ctx["ranks"]``
    under ``company_top_n``, ``company_top_n_en`` and ``company_top_n_zh_cn``.

    Args:
        config: Configuration dict holding the output paths.
        ctx: Shared context dict holding the personal rankings; receives the
            company rankings.
    """
    ranks = ctx["ranks"]
    targets = config["ranks"]["company"]

    # "" = all projects, "_en" = international projects, "_zh_cn" = Chinese
    # projects.  Each company ranking reads the personal ranking carrying the
    # same suffix.
    for suffix in ("", "_en", "_zh_cn"):
        company_top_n = _rank_companies(
            ranks["personal_top_n" + suffix], targets["top_n" + suffix])
        ranks["company_top_n" + suffix] = company_top_n
        print(company_top_n.head())
def calc_repo_rank(config, options, ctx): def calc_repo_rank(config, options, ctx):
...@@ -296,6 +435,9 @@ def calc_repo_rank(config, options, ctx): ...@@ -296,6 +435,9 @@ def calc_repo_rank(config, options, ctx):
def calc_personal_rank(config, options, ctx): def calc_personal_rank(config, options, ctx):
print("@load_district_zh_cn..")
load_district_zh_cn(config, ctx)
print("@load_repo_github_user_info..") print("@load_repo_github_user_info..")
load_repo_github_user_info(config, ctx) load_repo_github_user_info(config, ctx)
...@@ -307,9 +449,7 @@ def calc_personal_rank(config, options, ctx): ...@@ -307,9 +449,7 @@ def calc_personal_rank(config, options, ctx):
def calc_company_rank(config, options, ctx):
    """Run the company-ranking stage.

    Delegates to ``rank_company_top_n``; ``options`` is accepted but not used
    here.
    """
    rank_company_top_n(config, ctx)
def calc_ranks(config, options): def calc_ranks(config, options):
...@@ -319,4 +459,4 @@ def calc_ranks(config, options): ...@@ -319,4 +459,4 @@ def calc_ranks(config, options):
calc_repo_rank(config, options, ctx) calc_repo_rank(config, options, ctx)
calc_personal_rank(config, options, ctx) calc_personal_rank(config, options, ctx)
# calc_company_rank(config, options, ctx) calc_company_rank(config, options, ctx)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册