提交 60ff266b 编写于 作者: M MaoXianxin

Add deduplication

上级 5c1db0ca
import pandas as pd
import argparse
import numpy as np
# Filter items by programming language.
# To reuse this script, only csv_path, result_name and language need to be changed.
......@@ -13,7 +14,22 @@ args = parser.parse_args()
# Load the candidate projects (path given on the command line) and the
# already recommended projects.
csv1 = pd.read_csv(args.csv_path)
csv2 = pd.read_csv('recommended_projects.csv', usecols=['project path', 'project url'])

# Values of the first column of csv1; the set gives constant-time membership
# tests instead of scanning a list for every row of csv2.
projectNames1 = csv1.iloc[:, 0].tolist()
known_names = set(projectNames1)

# First-column values of csv2 that also occur in csv1. Missing values are
# skipped: they never compared equal to a 'project path' in the per-name
# drop, so they must not be matched by `isin` below either.
delete_project = [name for name in csv2.iloc[:, 0].dropna() if name in known_names]

# Remove every row whose 'project path' is already recommended, in a single
# pass, and renumber the remaining rows 0..n-1.
already_recommended = csv1['project path'].isin(delete_project)
csv1 = csv1.loc[~already_recommended].reset_index(drop=True)

# Lower-case the paths with a whole-column assignment. The element-wise
# chained form `csv1[col][i] = ...` triggers SettingWithCopyWarning and does
# not modify the frame when pandas Copy-on-Write is active; `.str.lower()`
# also leaves missing values untouched instead of raising.
csv1['project path'] = csv1['project path'].str.lower()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册