提交 8f48db32 编写于 作者: M MaoXianxin

process github link

上级 dee4af17
......@@ -12,3 +12,5 @@
aclImdb/README
*.csv
*.json
import pandas as pd
# Load the link table, derive two extra columns from each project URL,
# and write the enlarged table back to the same file.
csv = pd.read_csv('test.csv')
print(len(csv))

owners = []
repositories = []
for url in csv['project path']:
    # Pieces 3 and 4 of the '/'-split; for a URL shaped like
    # 'https://host/owner/repo' these are the owner and the repository
    # (assumed shape - it is not validated here, a shorter URL raises
    # IndexError).
    segments = url.split('/')
    owners.append(segments[3])
    repositories.append(segments[4])

csv['group_name'] = owners
csv['project_name'] = repositories

output_columns = ['project path', 'visits', 'group_name', 'project_name']
csv.to_csv('test.csv', columns=output_columns, index=False)
print(len(csv))
\ No newline at end of file
import pandas as pd
# Remove every row whose project URL splits into four or fewer
# '/'-separated pieces, then save the remaining rows.
csv = pd.read_csv('test.csv')
print(len(csv))

# read_csv labels rows 0..n-1, so the enumerate position is the row label.
short_rows = [
    row
    for row, url in enumerate(csv['project path'])
    if len(url.split('/')) <= 4
]
csv.drop(index=short_rows, inplace=True)

csv.to_csv('test.csv', columns=['project path', 'visits'], index=False)
print(len(csv))
\ No newline at end of file
import pandas as pd
# Remove every row whose project URL mentions 'login' or 'settings',
# then save the remaining rows.
csv = pd.read_csv('test.csv')
print(len(csv))

unwanted_rows = []
# read_csv labels rows 0..n-1, so the enumerate position is the row label.
for row, url in enumerate(csv['project path']):
    if any(marker in url for marker in ('login', 'settings')):
        unwanted_rows.append(row)
csv.drop(index=unwanted_rows, inplace=True)

csv.to_csv('test.csv', columns=['project path', 'visits'], index=False)
print(len(csv))
\ No newline at end of file
import pandas as pd
# Keep only the rows whose project URL contains 'github' and write them
# to test.csv.
csv = pd.read_csv('/home/mao/Documents/githublinks.csv')
print(len(csv))

# str() lets non-string cells (e.g. missing values) be tested as text
# instead of raising; such cells do not contain 'github' and are dropped.
non_github_rows = [
    row
    for row, url in enumerate(csv['project path'])
    if 'github' not in str(url)
]
csv.drop(index=non_github_rows, inplace=True)

csv.to_csv('test.csv', columns=['project path', 'visits'], index=False)
print(len(csv))
\ No newline at end of file
import pandas as pd
# Collapse rows that share a project name and save the result.
csv = pd.read_csv('test.csv')
print(len(csv))

# drop_duplicates keeps the first row seen for each project_name.
unique_projects = csv.drop_duplicates(subset=['project_name'])

output_columns = ['project path', 'visits', 'group_name', 'project_name']
unique_projects.to_csv('test.csv', columns=output_columns, index=False)
print(len(unique_projects))
\ No newline at end of file
import pandas as pd
# Strip the '#...' part from every project URL and save the table.
csv = pd.read_csv('test.csv')
print(len(csv))

# Keep only the text before the first '#'; URLs without '#' are unchanged.
# Each row is cleaned from its own value, and the column is replaced in a
# single assignment: writing through chained indexing
# (csv[column][row] = ...) is not guaranteed by pandas to reach the frame.
csv['project path'] = csv['project path'].str.split('#', n=1).str[0]

csv.to_csv('test.csv', columns=['project path', 'visits'], index=False)
print(len(csv))
\ No newline at end of file
import pandas as pd
# Cut every project URL at its first '%' (the start of a percent-escape)
# and save the table.
csv = pd.read_csv('test.csv')
print(len(csv))

# Keep only the text before the first '%'; URLs without '%' are unchanged.
# The column is replaced in a single assignment: writing through chained
# indexing (csv[column][row] = ...) is not guaranteed by pandas to reach
# the frame.
csv['project path'] = csv['project path'].str.split('%', n=1).str[0]

csv.to_csv('test.csv', columns=['project path', 'visits'], index=False)
print(len(csv))
\ No newline at end of file
import pandas as pd
# Remove ';' characters and the '.git' repository suffix from every
# project URL, then save the table.
csv = pd.read_csv('test.csv')
print(len(csv))

# Both clean-ups are applied to every URL, so a URL containing '.git' still
# has its ';' characters removed.
cleaned = csv['project path'].str.replace(';', '', regex=False)

# Drop '.git' only where it ends a path segment (followed by '/', '?', '#'
# or the end of the URL). A plain substring replace would also corrupt
# 'www.github.com', '.github' and 'name.github.io'.
cleaned = cleaned.str.replace(r'\.git(?=[/?#]|$)', '', regex=True)

# Replace the column in one assignment; writing through chained indexing
# (csv[column][row] = ...) is not guaranteed by pandas to reach the frame.
csv['project path'] = cleaned

csv.to_csv('test.csv', columns=['project path', 'visits'], index=False)
print(len(csv))
import pandas as pd
# Cut every project URL back to the part that precedes its first
# '/'-segment containing 'archive', then save the table.
csv = pd.read_csv('test.csv')
print(len(csv))


def _cut_at_archive(url):
    """Return *url* truncated just before its first segment containing 'archive'.

    A URL with no such segment is returned unchanged.
    """
    segments = url.split('/')
    for position, segment in enumerate(segments):
        # NOTE(review): substring match, so an owner or repository name that
        # merely contains 'archive' also triggers the cut - confirm intended.
        if 'archive' in segment:
            return '/'.join(segments[:position])
    return url


# Replace the column in one assignment; writing through chained indexing
# (csv[column][row] = ...) is not guaranteed by pandas to reach the frame.
csv['project path'] = csv['project path'].map(_cut_at_archive)

csv.to_csv('test.csv', columns=['project path', 'visits'], index=False)
print(len(csv))
\ No newline at end of file
import pandas as pd
# Cut every project URL back to the part that precedes its first 'blob'
# path segment, then save the table.
csv = pd.read_csv('test.csv')
print(len(csv))


def _cut_at_blob(url):
    """Return *url* truncated just before its first segment equal to 'blob'.

    A URL with no such segment is returned unchanged.
    """
    segments = url.split('/')
    # NOTE(review): an owner or repository literally named 'blob' would also
    # trigger the cut - confirm this cannot occur in the data.
    if 'blob' not in segments:
        return url
    return '/'.join(segments[:segments.index('blob')])


# Replace the column in one assignment; writing through chained indexing
# (csv[column][row] = ...) is not guaranteed by pandas to reach the frame.
csv['project path'] = csv['project path'].map(_cut_at_blob)

csv.to_csv('test.csv', columns=['project path', 'visits'], index=False)
print(len(csv))
\ No newline at end of file
import pandas as pd
# Cut every project URL back to the part that precedes its first
# '/'-segment containing 'issues', then save the table.
csv = pd.read_csv('test.csv')
print(len(csv))


def _cut_at_issues(url):
    """Return *url* truncated just before its first segment containing 'issues'.

    A URL with no such segment is returned unchanged.
    """
    segments = url.split('/')
    for position, segment in enumerate(segments):
        # NOTE(review): substring match, so an owner or repository name that
        # merely contains 'issues' also triggers the cut - confirm intended.
        if 'issues' in segment:
            return '/'.join(segments[:position])
    return url


# Replace the column in one assignment; writing through chained indexing
# (csv[column][row] = ...) is not guaranteed by pandas to reach the frame.
csv['project path'] = csv['project path'].map(_cut_at_issues)

csv.to_csv('test.csv', columns=['project path', 'visits'], index=False)
print(len(csv))
\ No newline at end of file
import pandas as pd
# Cut every project URL back to the part that precedes its first
# 'releases' path segment, then save the table.
csv = pd.read_csv('test.csv')
print(len(csv))


def _cut_at_releases(url):
    """Return *url* truncated just before its first segment equal to 'releases'.

    A URL with no such segment is returned unchanged.
    """
    segments = url.split('/')
    # NOTE(review): an owner or repository literally named 'releases' would
    # also trigger the cut - confirm this cannot occur in the data.
    if 'releases' not in segments:
        return url
    return '/'.join(segments[:segments.index('releases')])


# Replace the column in one assignment; writing through chained indexing
# (csv[column][row] = ...) is not guaranteed by pandas to reach the frame.
csv['project path'] = csv['project path'].map(_cut_at_releases)

csv.to_csv('test.csv', columns=['project path', 'visits'], index=False)
print(len(csv))
\ No newline at end of file
import pandas as pd
# Cut every project URL back to the part that precedes its first 'tree'
# path segment, then save the table.
csv = pd.read_csv('test.csv')
print(len(csv))


def _cut_at_tree(url):
    """Return *url* truncated just before its first segment equal to 'tree'.

    A URL with no such segment is returned unchanged.
    """
    segments = url.split('/')
    # NOTE(review): an owner or repository literally named 'tree' would also
    # trigger the cut - confirm this cannot occur in the data.
    if 'tree' not in segments:
        return url
    return '/'.join(segments[:segments.index('tree')])


# Replace the column in one assignment; writing through chained indexing
# (csv[column][row] = ...) is not guaranteed by pandas to reach the frame.
csv['project path'] = csv['project path'].map(_cut_at_tree)

csv.to_csv('test.csv', columns=['project path', 'visits'], index=False)
print(len(csv))
\ No newline at end of file
import pandas as pd
# Cut every project URL back to the part that precedes its first 'wiki'
# path segment, then save the table.
csv = pd.read_csv('test.csv')
print(len(csv))


def _cut_at_wiki(url):
    """Return *url* truncated just before its first segment equal to 'wiki'.

    A URL with no such segment is returned unchanged.
    """
    segments = url.split('/')
    # NOTE(review): an owner or repository literally named 'wiki' would also
    # trigger the cut - confirm this cannot occur in the data.
    if 'wiki' not in segments:
        return url
    return '/'.join(segments[:segments.index('wiki')])


# Replace the column in one assignment; writing through chained indexing
# (csv[column][row] = ...) is not guaranteed by pandas to reach the frame.
csv['project path'] = csv['project path'].map(_cut_at_wiki)

csv.to_csv('test.csv', columns=['project path', 'visits'], index=False)
print(len(csv))
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册