Commit eede30a9 authored by 幻灰龙

Merge branch 'master' into 'master'

Add cnblogs tag dataset

See merge request csdn/csdn-tags!9
-__pycache__
\ No newline at end of file
+__pycache__
+test.html
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
@@ -2,6 +2,8 @@ import click
 import tag_source.vscode
 import tag_source.stackoverflow
 import tag_source.segmentfault
+import tag_source.infoq
+import tag_source.cnblogs
 
 @click.command()
 @click.option('--source')
@@ -13,6 +15,10 @@ def fetch(source):
         tag_source.stackoverflow.fetch()
     elif source=='sf':
         tag_source.segmentfault.fetch()
+    elif source=='infoq':
+        tag_source.infoq.fetch()
+    elif source=='cnblogs':
+        tag_source.cnblogs.fetch()
 
 if __name__ == '__main__':
     fetch()
\ No newline at end of file
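With these hunks applied, the two new sources are reachable from the CLI dispatcher. A minimal usage sketch, assuming the click entry point above lives in main.py (the filename is not shown in this diff):

    python main.py --source cnblogs
    python main.py --source infoq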
tag_source/cnblogs.py

import os
import json
import urllib.request
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings


class CNBlogTagSpider(scrapy.Spider):
    name = "cnblogs_tags"
    allowed_domains = ["cnblogs.com"]
    start_urls = ['https://q.cnblogs.com/tag/list?pageindex=1']
    custom_settings = {
        'ITEM_PIPELINES': {'tag_source.cnblogs.TagPipeline': 301},
        'LOG_LEVEL': 'INFO'
    }

    def __init__(self):
        super().__init__()
        self.page_count = 0
        self.total_pages = 520

    def parse(self, response):
        self.page_count += 1
        tag_div = response.css('.tag-div')
        tags = tag_div.xpath('div/table/tr/td')
        for tag in tags:
            name = tag.xpath('li/a/text()').get()
            star = tag.xpath('li/text()').get()
            # Star counts are rendered as "(123)"; strip the parentheses.
            star = star[1:-1]
            yield {
                'name': name,
                'star': star
            }
        # Follow the last pager link until the page budget is exhausted.
        if self.page_count < self.total_pages:
            next_page_list = response.css('#pager>a')
            if len(next_page_list) > 0:
                next_page_item = next_page_list[len(next_page_list)-1]
                next_page = next_page_item.css('::attr(href)').get()
                print('next_page:', next_page)
                yield response.follow(next_page, callback=self.parse, dont_filter=True)


class TagPipeline(object):
    def open_spider(self, spider):
        self.file = open('dataset/cnblogs.tag.json', 'w')
        self.file.write('[\n')
        self.count = 0
        self.tags = {}

    def close_spider(self, spider):
        self.file.write('\n]')
        self.file.close()

    def process_item(self, item, spider):
        # Skip tags already written (de-duplicate by name).
        if self.tags.get(item['name']) is not None:
            return
        self.tags[item['name']] = True
        words = []
        if self.count > 0:
            words.append(',\n')
        words.append(' ')
        words.append(json.dumps(item, ensure_ascii=False).strip())
        self.file.write(''.join(words))
        self.count += 1


def fetch():
    process = CrawlerProcess()
    process.crawl(CNBlogTagSpider)
    process.start()
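TagPipeline builds the JSON array by hand, writing a comma before every entry after the first and skipping duplicate tag names, so the finished file is valid JSON. A minimal consumption sketch, assuming the crawl has completed and dataset/cnblogs.tag.json exists:

import json

# Each entry has the shape {"name": ..., "star": ...}.
with open('dataset/cnblogs.tag.json', encoding='utf-8') as f:
    tags = json.load(f)
print(len(tags), 'tags; first entry:', tags[0] if tags else None)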
tag_source/infoq.py

import os
import json
import urllib.request
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings


class InfoQSpider(scrapy.Spider):
    name = "infoq_tags"
    allowed_domains = ["infoq.cn"]
    start_urls = ['https://www.infoq.cn/topics']
    custom_settings = {
        'ITEM_PIPELINES': {'tag_source.infoq.TagPipeline': 301},
        'LOG_LEVEL': 'INFO'
    }

    def __init__(self):
        super().__init__()
        self.page_count = 0
        self.total_pages = 654

    def parse(self, response):
        self.page_count += 1
        tags = response.css('.navigation-list')
        for tag in tags:
            name = tag.xpath('h2/a/text()').get()
            desc = tag.xpath('p/text()').get()
            star = tag.xpath('div/strong/text()').get()
            yield {
                'name': name,
                'desc': desc,
                'star': star
            }
        # Follow the "next" pager link, if present.
        next_page_list = response.css('.next')
        if len(next_page_list) > 0:
            next_page_item = next_page_list[len(next_page_list)-1]
            next_page = next_page_item.css('a::attr(href)').get()
            print('next_page:', next_page)
            yield response.follow(next_page, callback=self.parse, dont_filter=True)


class TagPipeline(object):
    def open_spider(self, spider):
        # Write the InfoQ tags to their own dataset file.
        self.file = open('dataset/infoq.tag.json', 'w')
        self.file.write('[\n')
        self.count = 0
        self.tags = {}

    def close_spider(self, spider):
        self.file.write('\n]')
        self.file.close()

    def process_item(self, item, spider):
        # Skip tags already written (de-duplicate by name).
        if self.tags.get(item['name']) is not None:
            return
        self.tags[item['name']] = True
        words = []
        if self.count > 0:
            words.append(',\n')
        words.append(' ')
        words.append(json.dumps(item, ensure_ascii=False).strip())
        self.file.write(''.join(words))
        self.count += 1


def fetch():
    process = CrawlerProcess()
    process.crawl(InfoQSpider)
    process.start()
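Both spiders import HtmlResponse without using it; it comes in handy for exercising parse() offline against a canned page instead of a live crawl. A minimal sketch with a hypothetical HTML fixture (the real markup of infoq.cn/topics may differ):

from scrapy.http import HtmlResponse
from tag_source.infoq import InfoQSpider

# Hypothetical minimal fixture mirroring the selectors parse() expects.
body = b'<div class="navigation-list"><h2><a>Java</a></h2><p>JVM topics</p><div><strong>42</strong></div></div>'
response = HtmlResponse(url='https://www.infoq.cn/topics', body=body, encoding='utf-8')

spider = InfoQSpider()
for item in spider.parse(response):
    print(item)  # {'name': 'Java', 'desc': 'JVM topics', 'star': '42'}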