Commit 47043122 authored by feilong

Add segmentfault tag dataset

Parent da71c24f
The CLI entry script gains an `sf` option that dispatches to the new segmentfault scraper:

import click

import tag_source.vscode
import tag_source.stackoverflow
import tag_source.segmentfault


@click.command()
@click.option('--source')
def fetch(source):
    # Dispatch to the scraper matching the requested tag source.
    if source == 'vscode':
        tag_source.vscode.fetch()
    elif source == 'so':
        tag_source.stackoverflow.fetch()
    elif source == 'sf':
        tag_source.segmentfault.fetch()


if __name__ == '__main__':
    fetch()
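The new source is selected from the shell with `--source sf` (e.g. `python main.py --source sf`, assuming the entry script is saved as main.py). As a minimal sketch, the command can also be exercised in-process with click's built-in test runner; the `main` module name here is an assumption:

from click.testing import CliRunner

from main import fetch  # "main" is an assumed module name for the script above

runner = CliRunner()
result = runner.invoke(fetch, ['--source', 'sf'])  # equivalent to --source sf on the shell
print(result.exit_code)  # 0 on success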
The new spider module (tag_source/segmentfault.py, the path implied by the ITEM_PIPELINES entry) crawls the paginated tag list and writes the results through a small JSON pipeline:

import os
import json

import scrapy
from scrapy.crawler import CrawlerProcess
class SegmentFaultTagSpider(scrapy.Spider):
    name = "segmentfault_tags"
    allowed_domains = ["segmentfault.com"]
    start_urls = ['https://segmentfault.com/tags/all?page=1']
    custom_settings = {
        'ITEM_PIPELINES': {'tag_source.segmentfault.TagPipeline': 301},
        'LOG_LEVEL': 'INFO'
    }

    def __init__(self):
        super().__init__()
        self.page_count = 0
        self.total_pages = 654  # pages in the tag list at crawl time

    def parse(self, response):
        self.page_count += 1
        # Each tag card exposes its name, description and follower count.
        tags = response.css('.widget-tag')
        for tag in tags:
            name = tag.xpath('h2/a/text()').get()
            desc = tag.xpath('p/text()').get()
            star = tag.xpath('div/strong/text()').get()
            yield {
                'name': name,
                'desc': desc,
                'star': star
            }
        # Follow the last ".next" link; dont_filter keeps pagination going
        # even when Scrapy's dupe filter would otherwise drop the URL.
        next_page_list = response.css('.next')
        if len(next_page_list) > 0:
            next_page = next_page_list[-1].css('a::attr(href)').get()
            self.logger.info('next_page: %s', next_page)
            yield response.follow(next_page, callback=self.parse, dont_filter=True)
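
# The selectors above assume each tag card has roughly this shape
# (illustrative markup inferred from the XPath expressions, not the live page):
#
#   <div class="widget-tag">
#     <h2><a>tag-name</a></h2>
#     <p>tag description</p>
#     <div><strong>follower-count</strong></div>
#   </div>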

class TagPipeline(object):
    def open_spider(self, spider):
        os.makedirs('dataset', exist_ok=True)
        self.file = open('dataset/segmentfault.tag.json', 'w', encoding='utf-8')
        self.file.write('[\n')
        self.count = 0
        self.tags = {}

    def close_spider(self, spider):
        self.file.write('\n]')
        self.file.close()

    def process_item(self, item, spider):
        # Pagination can surface a tag twice; write each name only once.
        if self.tags.get(item['name']) is not None:
            return item
        self.tags[item['name']] = True
        words = []
        if self.count > 0:
            words.append(',\n')
        words.append(' ')
        words.append(json.dumps(item, ensure_ascii=False).strip())
        self.file.write(''.join(words))
        self.count += 1
        return item

def fetch():
    # Run the spider in a blocking CrawlerProcess; the spider's
    # custom_settings already wire up TagPipeline.
    process = CrawlerProcess()
    process.crawl(SegmentFaultTagSpider)
    process.start()
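The pipeline emits one JSON object per unique tag inside a top-level array, so the finished file loads with the standard json module. A minimal sketch of consuming the dataset, assuming the crawl has completed and the file exists:

import json

# Load the array written by TagPipeline; each entry has the shape
# {'name': ..., 'desc': ..., 'star': ...}.
with open('dataset/segmentfault.tag.json', encoding='utf-8') as f:
    tags = json.load(f)

print(len(tags), tags[0]['name'])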