提交 448d9afc 编写于 作者: SoftwareTeacher's avatar SoftwareTeacher

Merge branch 'master' into 'master'

标签数据集构建

See merge request csdn/csdn-tags!6
*.json
__pycache__
\ No newline at end of file
## 任务:构建最完善的技术标签数据集
## 爬虫工作环境
* python版本:3.6 ,以下用 python3.6 表示
* python3.6
* 依赖:pip3.6 install -r requirements.txt
## 数据源
* [x] stackoverflow 标签爬取
* 命令:python3.6 main.py --source=so
* 输出:src/dataset/stackoverflow.tag.json
* [ ] vscode 标签爬取
* 命令:python3.6 main.py --source=vscode
* 输出:src/dataset/vscode.tag.json
\ No newline at end of file
click
cssselect
lxml
scrapy
\ No newline at end of file
此差异已折叠。
import click
import tag_source.vscode
import tag_source.stackoverflow
@click.command()
@click.option('--source')
def fetch(source):
    """CLI entry point: crawl tags from the given source.

    Supported values for --source:
      * 'so'     -> Stack Overflow tag synonyms (tag_source.stackoverflow)
      * 'vscode' -> VS Code marketplace (tag_source.vscode)
    """
    click.echo('will fetch tags from %s!' % source)
    if source == 'vscode':
        tag_source.vscode.fetch()
    elif source == 'so':
        tag_source.stackoverflow.fetch()
    else:
        # BUG fix: unrecognized sources previously fell through silently,
        # leaving the user with no hint why nothing was crawled.
        click.echo('unknown source: %s (expected "so" or "vscode")' % source)

if __name__ == '__main__':
    fetch()
\ No newline at end of file
import os
import json
import urllib.request
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
class StackOverflowTagSpider(scrapy.Spider):
    """Crawl the Stack Overflow tag-synonyms listing and yield tag names.

    Yields one ``{'name': <tag>}`` dict per ``.post-tag`` element and
    follows pagination links for up to ``total_pages`` pages.
    """
    name = "stackoverflow_tags"
    allowed_domains = ["stackoverflow.com"]
    start_urls = ['https://stackoverflow.com/tags/synonyms?page=1']
    custom_settings = {
        'ITEM_PIPELINES': {'tag_source.stackoverflow.TagPipeline': 301},
        'LOG_LEVEL': 'INFO'
    }

    def __init__(self, *args, **kwargs):
        # BUG fix: the original skipped scrapy.Spider.__init__, so spider
        # arguments/kwargs were never applied. Delegate to the base class.
        super().__init__(*args, **kwargs)
        self.total_pages = 45  # hard page cap (fixed typo: was 'totgal_pages')
        self.page_count = 0    # pages parsed so far

    def parse(self, response):
        """Emit every tag on the page, then follow the 'next' pagination link."""
        self.page_count += 1
        for tag in response.css('.post-tag::text'):
            yield {
                'name': tag.get()
            }
        if self.page_count < self.total_pages:
            next_page_list = response.css('a.js-pagination-item::attr(href)')
            if len(next_page_list) > 0:
                # The last pagination item is the 'next' link.
                next_page = next_page_list[len(next_page_list) - 1].get()
                print('next_page:', next_page)
                # dont_filter: the same href can legitimately repeat.
                yield response.follow(next_page, callback=self.parse, dont_filter=True)
class TagPipeline(object):
    """Scrapy item pipeline that writes de-duplicated tags as a JSON array.

    Items are streamed to ``dataset/stackoverflow.tag.json`` as they arrive;
    ``open_spider``/``close_spider`` emit the surrounding ``[`` / ``]``.
    """

    def open_spider(self, spider):
        # Explicit utf-8: tag names may contain non-ASCII characters and the
        # platform default encoding is not guaranteed.
        self.file = open('dataset/stackoverflow.tag.json', 'w', encoding='utf-8')
        self.file.write('[\n')
        self.count = 0      # items written so far (controls comma placement)
        self.tags = set()   # names already written (was a dict abused as a set)

    def close_spider(self, spider):
        self.file.write('\n]')
        self.file.close()

    def process_item(self, item, spider):
        """Write *item* unless its name was already seen; always return it.

        BUG fix: the original returned None, which violates the scrapy
        pipeline contract and would feed None to any later pipeline stage.
        """
        name = item['name']
        if name not in self.tags:
            self.tags.add(name)
            prefix = ',\n' if self.count > 0 else ''
            self.file.write(prefix + '  ' + json.dumps(item, ensure_ascii=False).strip())
            self.count += 1
        return item
def fetch():
    """Run StackOverflowTagSpider to completion in a blocking CrawlerProcess.

    Note: the original built an unused ``Settings()`` instance; removed.
    """
    process = CrawlerProcess()
    process.crawl(StackOverflowTagSpider)
    process.start()  # blocks until the crawl finishes
\ No newline at end of file
import os
import urllib.request
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
class VSCodeTagSpider(scrapy.Spider):
    """Spider for VS Code marketplace tags.

    Currently a placeholder: ``parse`` is not implemented yet and only
    prints a marker (see the README's unchecked 'vscode' task).
    """

    name = "vscode_tags"
    allowed_domains = ["visualstudio.com"]
    # Marketplace search, all categories, sorted by install count.
    start_urls = [
        'https://marketplace.visualstudio.com/search'
        '?target=VSCode&category=All%20categories&sortBy=Installs'
    ]

    def parse(self, response):
        # TODO: extract extension tags from the search results page.
        print('todo')
def fetch():
    """Run VSCodeTagSpider to completion in a blocking CrawlerProcess.

    Note: the original built an unused ``Settings()`` instance; removed.
    """
    process = CrawlerProcess()
    process.crawl(VSCodeTagSpider)
    process.start()  # blocks until the crawl finishes
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册