添加segmentfault标签数据集

47043122 · feilong · da71c24f · 47043122 · 47043122 · 47043122
Showing changes with 13153 additions and 0 deletions

src/dataset/segmentfault.tag.json src/dataset/segmentfault.tag.json +13076 -0

src/main.py src/main.py +3 -0

src/tag_source/segmentfault.py src/tag_source/segmentfault.py +74 -0

未找到文件。
--- a/src/dataset/segmentfault.tag.json
+++ b/src/dataset/segmentfault.tag.json
--- a/src/main.py
+++ b/src/main.py
 import click
 import tag_source.vscode
 import tag_source.stackoverflow
+import tag_source.segmentfault

 @click.command()
 @click.option('--source')
@@ -10,6 +11,8 @@ def fetch(source):
        tag_source.vscode.fetch()
    elif source=='so':
        tag_source.stackoverflow.fetch()
+    elif source=='sf':
+        tag_source.segmentfault.fetch()

 if __name__ == '__main__':
    fetch()
\ No newline at end of file
--- a/src/tag_source/segmentfault.py
+++ b/src/tag_source/segmentfault.py
+import os
+import json
+import urllib.request
+from scrapy.selector import Selector
+from scrapy.http import HtmlResponse
+import scrapy
+
+from scrapy.crawler import CrawlerProcess
+from scrapy.settings import Settings
+
+class SegmentFaultTagSpider(scrapy.Spider):
+    name = "segmentfault_tags"
+    allowed_domains = ["segmentfault.com"]
+    start_urls = ['https://segmentfault.com/tags/all?page=1']
+    custom_settings = {
+    	'ITEM_PIPELINES':{'tag_source.segmentfault.TagPipeline': 301},
+        'LOG_LEVEL': 'INFO'
+    }
+
+    def __init__(self):
+        self.page_count = 0
+        self.totgal_pages = 654
+
+    def parse(self, response):
+        self.page_count += 1
+        tags = response.css('.widget-tag')
+        for tag in tags:
+            name = tag.xpath('h2/a/text()').get()
+            desc = tag.xpath('p/text()').get()
+            star = tag.xpath('div/strong/text()').get()
+            
+            yield {
+                'name': name,
+                'desc': desc,
+                'star': star
+            }
+        
+        next_page_list = response.css('.next')
+        if len(next_page_list)>0:
+            next_page_item = next_page_list[len(next_page_list)-1]
+            next_page = next_page_item.css('a::attr(href)').get()
+            print('next_page:', next_page)
+            yield response.follow(next_page, callback=self.parse, dont_filter=True)
+
+class TagPipeline(object):
+    def open_spider(self, spider):
+        self.file = open('dataset/segmentfault.tag.json', 'w')
+        self.file.write('[\n')
+        self.count = 0
+        self.tags = {}
+
+    def close_spider(self, spider):
+        self.file.write('\n]')
+        self.file.close()
+
+    def process_item(self, item, spider):
+        if self.tags.get(item['name']) is not None:
+            return
+        self.tags[item['name']] = True
+
+        words = []
+        if self.count>0:
+            words.append(',\n')
+        words.append('  ')
+        words.append(json.dumps(item, ensure_ascii=False).strip())
+        line = ''.join(words)
+        self.file.write(line)
+        self.count += 1
+
+def fetch():
+    settings = Settings()
+    process = CrawlerProcess()
+    process.crawl(SegmentFaultTagSpider)
+    process.start()
\ No newline at end of file