提交 dec615f5 编写于 作者: F feilong

添加标签数据集构建脚本,增加stackoverflow标签爬虫

上级 4db64100
*.json
__pycache__
\ No newline at end of file
## 任务:构建最完善的技术标签数据集
## 爬虫工作环境
* python版本:3.6 ,以下用 python3.6 表示
* python3.6
* 依赖安装:pip3.6 install -r requirements.txt
## 数据源
- [x] stackoverflow 标签爬取
* 命令:python3.6 main.py --source=so
* 输出:src/dataset/stackoverflow.tag.json
- [ ] vscode 标签爬取
* 命令:python3.6 main.py --source=vscode
* 输出:src/dataset/vscode.tag.json
\ No newline at end of file
click
cssselect
lxml
scrapy
\ No newline at end of file
此差异已折叠。
import click
import tag_source.vscode
import tag_source.stackoverflow
@click.command()
@click.option('--source')
def fetch(source):
    """CLI entry point: crawl tags from the selected source.

    Supported values for --source are 'so' (Stack Overflow) and
    'vscode' (VS Code marketplace); anything else is a no-op.
    """
    click.echo('will fetch tags from %s!' % source)
    # Dispatch table instead of an if/elif chain; unknown sources fall through.
    fetchers = {
        'vscode': tag_source.vscode.fetch,
        'so': tag_source.stackoverflow.fetch,
    }
    fetcher = fetchers.get(source)
    if fetcher is not None:
        fetcher()

if __name__ == '__main__':
    fetch()
\ No newline at end of file
import os
import json
import urllib.request
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
class StackOverflowTagSpider(scrapy.Spider):
    """Crawl Stack Overflow's tag-synonyms pages and yield one item per tag.

    Starts at https://stackoverflow.com/tags/synonyms?page=1 and follows the
    pagination links for up to ``total_pages`` pages, emitting
    ``{'name': <tag>}`` dicts that TagPipeline serializes to JSON.
    """
    # Fixed: this spider was named "vscode_tags" (clashing with the vscode
    # spider) and restricted to visualstudio.com, which makes Scrapy's
    # offsite middleware drop every stackoverflow.com request.
    name = "stackoverflow_tags"
    allowed_domains = ["stackoverflow.com"]
    start_urls = ['https://stackoverflow.com/tags/synonyms?page=1']
    custom_settings = {
        'ITEM_PIPELINES': {'tag_source.stackoverflow.TagPipeline': 301},
        'LOG_LEVEL': 'INFO'
    }

    def __init__(self, *args, **kwargs):
        # Fixed: scrapy.Spider.__init__ was never called.
        super().__init__(*args, **kwargs)
        self.total_pages = 45  # hard cap on pages to fetch (typo "totgal" fixed)
        self.page_count = 0    # pages parsed so far

    def parse(self, response):
        """Yield a dict per tag on the page, then follow the next page link."""
        self.page_count += 1
        for tag in response.css('.post-tag::text'):
            yield {
                'name': tag.get()
            }
        if self.page_count < self.total_pages:
            next_page_list = response.css('a.js-pagination-item::attr(href)')
            if len(next_page_list) > 0:
                # The last pagination item is the "next" link.
                next_page = next_page_list[-1].get()
                print('next_page:', next_page)
                yield response.follow(next_page, callback=self.parse, dont_filter=True)
class TagPipeline(object):
    """Scrapy item pipeline that streams tag items into a JSON array file.

    Writes ``dataset/stackoverflow.tag.json`` incrementally: ``[`` on open,
    one comma-separated ``json.dumps`` line per item, ``]`` on close.
    """

    def open_spider(self, spider):
        # Fixed: explicit utf-8 encoding — json.dumps(..., ensure_ascii=False)
        # can emit non-ASCII tag names, which would crash on a platform
        # default codec such as cp1252.
        self.file = open('dataset/stackoverflow.tag.json', 'w', encoding='utf-8')
        self.file.write('[\n')
        self.count = 0  # items written so far; used to place commas

    def close_spider(self, spider):
        self.file.write('\n]')
        self.file.close()

    def process_item(self, item, spider):
        words = []
        if self.count > 0:
            words.append(',\n')  # comma before every item except the first
        words.append(' ')
        words.append(json.dumps(item, ensure_ascii=False).strip())
        self.file.write(''.join(words))
        self.count += 1
        # Fixed: Scrapy pipelines must return the item (or raise DropItem)
        # so that later pipelines still receive it.
        return item
def fetch():
    """Run the Stack Overflow tag spider; blocks until the crawl finishes.

    Output is written by TagPipeline to dataset/stackoverflow.tag.json.
    """
    # Fixed: removed unused `settings = Settings()` local — CrawlerProcess
    # was never given it, so it had no effect.
    process = CrawlerProcess()
    process.crawl(StackOverflowTagSpider)
    process.start()
\ No newline at end of file
import os
import urllib.request
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
class CategoryItem(scrapy.Item):
    """A marketplace category: display name plus its listing-page URL."""
    name = scrapy.Field()  # category display name
    addr = scrapy.Field()  # URL of the category listing (fetched by Categoryline)
class TagItem(scrapy.Item):
    """A single scraped tag; only the tag's name is captured."""
    name = scrapy.Field()  # tag display name
class VSCodeTagSpider(scrapy.Spider):
    """Stub spider for VS Code marketplace tags — not implemented yet.

    NOTE(review): start_urls still points at Stack Overflow while
    allowed_domains is visualstudio.com, so no request would survive the
    offsite filter; parse() only prints a placeholder. The intended start
    URL appears to be the commented-out marketplace search — confirm before
    enabling this spider.
    """
    name = "vscode_tags"
    allowed_domains = ["visualstudio.com"]
    # start_urls = ['https://marketplace.visualstudio.com/search?target=VSCode&category=All%20categories&sortBy=Installs']
    start_urls = ['https://stackoverflow.com/tags/synonyms?page=1']

    def parse(self, response):
        # Placeholder: real extraction logic has not been written yet.
        print('todo')
class Categoryline(object):
    """Pipeline stub: fetches each category's listing page.

    NOTE(review): the fetched response body is currently unused — presumably
    parsing was planned but never written; verify intent before extending.
    """

    def process_item(self, item, spider):
        # Browser-like User-Agent so the marketplace does not reject the request.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'}
        req = urllib.request.Request(url=item['addr'], headers=headers)
        # Fixed: close the HTTP response instead of leaking the socket.
        with urllib.request.urlopen(req):
            pass
        # Fixed: return the item so downstream pipelines still receive it.
        return item
def fetch():
    """Run the VS Code tag spider; blocks until the crawl finishes.

    The spider is still a stub, so this currently produces no items.
    """
    # Fixed: removed unused `settings = Settings()` local — CrawlerProcess
    # was never given it, so it had no effect.
    process = CrawlerProcess()
    process.crawl(VSCodeTagSpider)
    process.start()
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册