提交 448d9afc 编写于 作者: SoftwareTeacher's avatar SoftwareTeacher

Merge branch 'master' into 'master'

标签数据集构建

See merge request csdn/csdn-tags!6
*.json
__pycache__
\ No newline at end of file
## 任务:构建最完善的技术标签数据集
## 爬虫工作环境
* python版本:3.6 ,以下用 python3.6 表示
* python3.6
* 依赖:pip3.6 install -r requirements.txt
## 数据源
* [x] stackoverflow 标签爬取
* 命令:python3.6 main.py --source=so
* 输出:src/dataset/stackoverflow.tag.json
* [ ] vscode 标签爬取
* 命令:python3.6 main.py --source=vscode
* 输出:src/dataset/vscode.tag.json
\ No newline at end of file
click
cssselect
lxml
scrapy
\ No newline at end of file
此差异已折叠。
import click
import tag_source.vscode
import tag_source.stackoverflow
@click.command()
@click.option('--source')
def fetch(source):
    """CLI entry point: crawl tags from the given source.

    Supported values for --source:
      * 'so'     -> Stack Overflow tag synonyms (tag_source.stackoverflow)
      * 'vscode' -> VS Code marketplace (tag_source.vscode)
    """
    click.echo('will fetch tags from %s!' % source)
    if source == 'vscode':
        tag_source.vscode.fetch()
    elif source == 'so':
        tag_source.stackoverflow.fetch()
    else:
        # BUG fix: unrecognized sources previously fell through silently,
        # leaving the user with no hint why nothing was crawled.
        click.echo('unknown source: %s (expected "so" or "vscode")' % source)

if __name__ == '__main__':
    fetch()
\ No newline at end of file
import os
import json
import urllib.request
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
class StackOverflowTagSpider(scrapy.Spider):
    """Crawl the Stack Overflow tag-synonyms listing and yield tag names.

    Yields one ``{'name': <tag>}`` dict per ``.post-tag`` element and
    follows pagination links for up to ``total_pages`` pages.
    """
    name = "stackoverflow_tags"
    allowed_domains = ["stackoverflow.com"]
    start_urls = ['https://stackoverflow.com/tags/synonyms?page=1']
    custom_settings = {
        'ITEM_PIPELINES': {'tag_source.stackoverflow.TagPipeline': 301},
        'LOG_LEVEL': 'INFO'
    }

    def __init__(self, *args, **kwargs):
        # BUG fix: the original skipped scrapy.Spider.__init__, so spider
        # arguments/kwargs were never applied. Delegate to the base class.
        super().__init__(*args, **kwargs)
        self.total_pages = 45  # hard page cap (fixed typo: was 'totgal_pages')
        self.page_count = 0    # pages parsed so far

    def parse(self, response):
        """Emit every tag on the page, then follow the 'next' pagination link."""
        self.page_count += 1
        for tag in response.css('.post-tag::text'):
            yield {
                'name': tag.get()
            }
        if self.page_count < self.total_pages:
            next_page_list = response.css('a.js-pagination-item::attr(href)')
            if len(next_page_list) > 0:
                # The last pagination item is the 'next' link.
                next_page = next_page_list[len(next_page_list) - 1].get()
                print('next_page:', next_page)
                # dont_filter: the same href can legitimately repeat.
                yield response.follow(next_page, callback=self.parse, dont_filter=True)
class TagPipeline(object):
    """Scrapy item pipeline that writes de-duplicated tags as a JSON array.

    Items are streamed to ``dataset/stackoverflow.tag.json`` as they arrive;
    ``open_spider``/``close_spider`` emit the surrounding ``[`` / ``]``.
    """

    def open_spider(self, spider):
        # Explicit utf-8: tag names may contain non-ASCII characters and the
        # platform default encoding is not guaranteed.
        self.file = open('dataset/stackoverflow.tag.json', 'w', encoding='utf-8')
        self.file.write('[\n')
        self.count = 0      # items written so far (controls comma placement)
        self.tags = set()   # names already written (was a dict abused as a set)

    def close_spider(self, spider):
        self.file.write('\n]')
        self.file.close()

    def process_item(self, item, spider):
        """Write *item* unless its name was already seen; always return it.

        BUG fix: the original returned None, which violates the scrapy
        pipeline contract and would feed None to any later pipeline stage.
        """
        name = item['name']
        if name not in self.tags:
            self.tags.add(name)
            prefix = ',\n' if self.count > 0 else ''
            self.file.write(prefix + '  ' + json.dumps(item, ensure_ascii=False).strip())
            self.count += 1
        return item
def fetch():
    """Run StackOverflowTagSpider to completion in a blocking CrawlerProcess.

    Note: the original built an unused ``Settings()`` instance; removed.
    """
    process = CrawlerProcess()
    process.crawl(StackOverflowTagSpider)
    process.start()  # blocks until the crawl finishes
\ No newline at end of file
import os
import urllib.request
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
class VSCodeTagSpider(scrapy.Spider):
    """Spider for VS Code marketplace tags.

    Currently a placeholder: ``parse`` is not implemented yet and only
    prints a marker (see the README's unchecked 'vscode' task).
    """

    name = "vscode_tags"
    allowed_domains = ["visualstudio.com"]
    # Marketplace search, all categories, sorted by install count.
    start_urls = [
        'https://marketplace.visualstudio.com/search'
        '?target=VSCode&category=All%20categories&sortBy=Installs'
    ]

    def parse(self, response):
        # TODO: extract extension tags from the search results page.
        print('todo')
def fetch():
    """Run VSCodeTagSpider to completion in a blocking CrawlerProcess.

    Note: the original built an unused ``Settings()`` instance; removed.
    """
    process = CrawlerProcess()
    process.crawl(VSCodeTagSpider)
    process.start()  # blocks until the crawl finishes
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册