Commit 0d9bb4db authored by: F feilong

Add OSChina tag set

Parent c0719574
@@ -4,21 +4,27 @@ import tag_source.stackoverflow
 import tag_source.segmentfault
 import tag_source.infoq
 import tag_source.cnblogs
+import tag_source.oschina


 @click.command()
 @click.option('--source')
 def fetch(source):
     click.echo('will fetch tags from %s!' % source)
-    if source=='vscode':
-        tag_source.vscode.fetch()
-    elif source=='so':
-        tag_source.stackoverflow.fetch()
-    elif source=='sf':
-        tag_source.segmentfault.fetch()
-    elif source=='infoq':
-        tag_source.infoq.fetch()
-    elif source=='cnblogs':
-        tag_source.cnblogs.fetch()
+    sources = {
+        'vscode': lambda: tag_source.vscode.fetch(),
+        'so': lambda: tag_source.stackoverflow.fetch(),
+        'sf': lambda: tag_source.segmentfault.fetch(),
+        'infoq': lambda: tag_source.infoq.fetch(),
+        'cnblogs': lambda: tag_source.cnblogs.fetch(),
+        'oschina': lambda: tag_source.oschina.fetch(),
+    }
+    action = sources.get(source)
+    if action is not None:
+        action()
+    else:
+        print('source {} is not supported now.'.format(source))

 if __name__ == '__main__':
     fetch()
\ No newline at end of file
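The hunk above swaps the if/elif chain for a dictionary dispatch table, so registering a new source only takes one import plus one entry. Below is a minimal sketch of exercising the command with click's test runner; the module name `main` is an assumption, since the path of the file in this hunk is not shown here.

# Minimal sketch, assuming the click command above is importable as `fetch`
# from a module named `main` (hypothetical name, not shown in this diff).
from click.testing import CliRunner

from main import fetch

runner = CliRunner()

# An unknown source falls through to the else branch instead of raising.
result = runner.invoke(fetch, ['--source', 'gitlab'])
print(result.output)
# will fetch tags from gitlab!
# source gitlab is not supported now.

# A registered source looks up its lambda in the dispatch table and runs it,
# e.g. ['--source', 'oschina'] would start the OSChina crawl below.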
tag_source/oschina.py

import json

import scrapy
from scrapy.crawler import CrawlerProcess


class OSChinaTagSpider(scrapy.Spider):
    """Crawl the OSChina question-tag pages and yield one item per tag."""
    name = "oschina_tags"
    allowed_domains = ["oschina.net"]
    start_urls = ['https://www.oschina.net/question/tags?p=1']
    custom_settings = {
        'ITEM_PIPELINES': {'tag_source.oschina.TagPipeline': 301},
        'LOG_LEVEL': 'INFO'
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.page_count = 0
        self.total_pages = 606

    def parse(self, response):
        self.page_count += 1
        # Each tag is rendered as a .tag-card with a name, a question count and a description.
        tags = response.css('.tag-card')
        for tag in tags:
            content = tag.css('.content')
            name = content.css('.header::text').get()
            star = int(content.css('.meta::text').get().replace(' 个问答', ''))
            desc = content.css('.description::text').get()
            yield {
                'name': name,
                'star': star,
                'desc': desc
            }
        # Follow the "next page" link until the known number of pages has been visited.
        if self.page_count < self.total_pages:
            next_page = response.css('.next-item::attr(href)').get()
            if next_page is not None:
                print('next_page:', next_page)
                yield response.follow(next_page, callback=self.parse, dont_filter=True)


class TagPipeline(object):
    """Write deduplicated tag items to dataset/oschina.tag.json as a JSON array."""

    def open_spider(self, spider):
        self.file = open('dataset/oschina.tag.json', 'w')
        self.file.write('[\n')
        self.count = 0
        self.tags = {}

    def close_spider(self, spider):
        self.file.write('\n]')
        self.file.close()

    def process_item(self, item, spider):
        # Skip tags that have already been written.
        if self.tags.get(item['name']) is not None:
            return item
        self.tags[item['name']] = True
        words = []
        if self.count > 0:
            words.append(',\n')
        words.append(' ')
        words.append(json.dumps(item, ensure_ascii=False).strip())
        self.file.write(''.join(words))
        self.count += 1
        return item


def fetch():
    # Run the spider in-process; this is what the `--source oschina` CLI branch calls.
    process = CrawlerProcess()
    process.crawl(OSChinaTagSpider)
    process.start()
\ No newline at end of file
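TagPipeline leaves a deduplicated JSON array of {name, star, desc} objects in dataset/oschina.tag.json. A minimal sketch of consuming that file after a crawl, with the path and field names taken from the pipeline above:

# Minimal sketch: load the file written by TagPipeline and list the
# most-answered OSChina tags. Assumes fetch() above has already been run.
import json

with open('dataset/oschina.tag.json', encoding='utf-8') as f:
    tags = json.load(f)

# 'star' holds the question count parsed from the " 个问答" label.
top = sorted(tags, key=lambda t: t['star'], reverse=True)[:10]
for t in top:
    print(t['star'], t['name'], '-', (t['desc'] or '').strip())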