提交 94d7588d 编写于 作者: F feilong

标签爬虫去重

上级 7ce3242c
此差异已折叠。
......@@ -42,12 +42,17 @@ class TagPipeline(object):
self.file = open('dataset/stackoverflow.tag.json', 'w')
self.file.write('[\n')
self.count = 0
self.tags = {}
def close_spider(self, spider):
    """Write the closing bracket of the JSON array and close the output file.

    Args:
        spider: Not used by this method; accepted to match the pipeline
            hook signature (presumably Scrapy's ``close_spider`` -- confirm).

    Raises:
        Whatever ``self.file.write`` raises; the file is closed regardless.
    """
    try:
        # Terminates the array whose opening '[' is written when the file is opened.
        self.file.write('\n]')
    finally:
        # Always release the handle, even if the final write failed.
        self.file.close()
def process_item(self, item, spider):
if self.tags.get(item['name']) is not None:
return
self.tags[item['name']] = True
words = []
if self.count>0:
words.append(',\n')
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册