Commit 0d9bb4db authored by: F feilong

Add OSChina tag set

Parent c0719574
@@ -4,21 +4,27 @@ import tag_source.stackoverflow
 import tag_source.segmentfault
 import tag_source.infoq
 import tag_source.cnblogs
+import tag_source.oschina


 @click.command()
 @click.option('--source')
 def fetch(source):
     click.echo('will fetch tags from %s!' % source)
-    if source=='vscode':
-        tag_source.vscode.fetch()
-    elif source=='so':
-        tag_source.stackoverflow.fetch()
-    elif source=='sf':
-        tag_source.segmentfault.fetch()
-    elif source=='infoq':
-        tag_source.infoq.fetch()
-    elif source=='cnblogs':
-        tag_source.cnblogs.fetch()
+    sources = {
+        'vscode': lambda: tag_source.vscode.fetch(),
+        'so': lambda: tag_source.stackoverflow.fetch(),
+        'sf': lambda: tag_source.segmentfault.fetch(),
+        'infoq': lambda: tag_source.infoq.fetch(),
+        'cnblogs': lambda: tag_source.cnblogs.fetch(),
+        'oschina': lambda: tag_source.oschina.fetch(),
+    }
+    action = sources.get(source)
+    if action is not None:
+        action()
+    else:
+        print('source {} is not supported now.'.format(source))

 if __name__ == '__main__':
     fetch()
\ No newline at end of file
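The hunk above swaps the if/elif chain for a dictionary dispatch table, so registering a new source only takes one import plus one entry. Below is a minimal sketch of exercising the command with click's test runner; the module name `main` is an assumption, since the path of the file in this hunk is not shown here.

# Minimal sketch, assuming the click command above is importable as `fetch`
# from a module named `main` (hypothetical name, not shown in this diff).
from click.testing import CliRunner

from main import fetch

runner = CliRunner()

# An unknown source falls through to the else branch instead of raising.
result = runner.invoke(fetch, ['--source', 'gitlab'])
print(result.output)
# will fetch tags from gitlab!
# source gitlab is not supported now.

# A registered source looks up its lambda in the dispatch table and runs it,
# e.g. ['--source', 'oschina'] would start the OSChina crawl below.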
tag_source/oschina.py

import json

import scrapy
from scrapy.crawler import CrawlerProcess


class OSChinaTagSpider(scrapy.Spider):
    """Crawl the OSChina question-tag pages and yield one item per tag."""
    name = "oschina_tags"
    allowed_domains = ["oschina.net"]
    start_urls = ['https://www.oschina.net/question/tags?p=1']
    custom_settings = {
        'ITEM_PIPELINES': {'tag_source.oschina.TagPipeline': 301},
        'LOG_LEVEL': 'INFO'
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.page_count = 0
        self.total_pages = 606

    def parse(self, response):
        self.page_count += 1
        # Each tag is rendered as a .tag-card with a name, a question count and a description.
        tags = response.css('.tag-card')
        for tag in tags:
            content = tag.css('.content')
            name = content.css('.header::text').get()
            star = int(content.css('.meta::text').get().replace(' 个问答', ''))
            desc = content.css('.description::text').get()
            yield {
                'name': name,
                'star': star,
                'desc': desc
            }
        # Follow the "next page" link until the known number of pages has been visited.
        if self.page_count < self.total_pages:
            next_page = response.css('.next-item::attr(href)').get()
            if next_page is not None:
                print('next_page:', next_page)
                yield response.follow(next_page, callback=self.parse, dont_filter=True)


class TagPipeline(object):
    """Write deduplicated tag items to dataset/oschina.tag.json as a JSON array."""

    def open_spider(self, spider):
        self.file = open('dataset/oschina.tag.json', 'w')
        self.file.write('[\n')
        self.count = 0
        self.tags = {}

    def close_spider(self, spider):
        self.file.write('\n]')
        self.file.close()

    def process_item(self, item, spider):
        # Skip tags that have already been written.
        if self.tags.get(item['name']) is not None:
            return item
        self.tags[item['name']] = True
        words = []
        if self.count > 0:
            words.append(',\n')
        words.append(' ')
        words.append(json.dumps(item, ensure_ascii=False).strip())
        self.file.write(''.join(words))
        self.count += 1
        return item


def fetch():
    # Run the spider in-process; this is what the `--source oschina` CLI branch calls.
    process = CrawlerProcess()
    process.crawl(OSChinaTagSpider)
    process.start()
\ No newline at end of file
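TagPipeline leaves a deduplicated JSON array of {name, star, desc} objects in dataset/oschina.tag.json. A minimal sketch of consuming that file after a crawl, with the path and field names taken from the pipeline above:

# Minimal sketch: load the file written by TagPipeline and list the
# most-answered OSChina tags. Assumes fetch() above has already been run.
import json

with open('dataset/oschina.tag.json', encoding='utf-8') as f:
    tags = json.load(f)

# 'star' holds the question count parsed from the " 个问答" label.
top = sorted(tags, key=lambda t: t['star'], reverse=True)[:10]
for t in top:
    print(t['star'], t['name'], '-', (t['desc'] or '').strip())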