提交 2fd13efc 编写于 作者: 幻灰龙's avatar 幻灰龙

Merge branch 'master' into 'master'

增加带版本标签集

See merge request csdn/csdn-tags!14
"图像处理",
"计算机视觉",
"自然语言处理",
"语音识别",
"机器翻译",
"机器学习",
"神经网络",
"深度学习",
"迁移学习",
"强化学习",
"集成学习",
"联邦学习",
"人机对话",
"知识图谱",
"图计算",
"情感分析",
"目标检测",
"目标跟踪",
"生成对抗网络",
"协同过滤",
"语言模型",
"智能推荐",
"tensorflow",
"pytorch",
"oneflow",
"opencv",
"keras",
"caffe",
"sklearn",
"mxnet",
"mlnet",
"mllib",
"paddlepaddle",
"jittor",
"deeplearning4j",
"mindspore",
"cnn",
"rnn",
"lstm",
"gru",
"dnn",
"mnn",
"alexnet",
"googlenet",
"deepfm",
"boosting",
"tensorrt",
"yolo",
"elmo",
"transformer",
"word2vec",
"doc2vec",
"glove",
"xlnet",
"bert",
"nnlm",
"lda主题模型",
"deepdive",
[
]
\ No newline at end of file
......@@ -11,42 +11,63 @@ from scrapy.settings import Settings
class InfoQSpider(scrapy.Spider):
    """Spider that fetches the topic/tag list from infoq.cn's JSON API.

    NOTE(review): the source was a scraped merge diff with old and new
    lines interleaved (duplicate ``start_urls`` / ``'LOG_LEVEL'`` entries
    made it invalid Python). This is the reconstructed post-merge version:
    the HTML topics page was replaced by the ``getList`` JSON endpoint,
    and the old CSS/XPath extraction plus pagination were disabled.
    """

    name = "infoq_tags"
    allowed_domains = ["infoq.cn"]
    # The HTML page 'https://www.infoq.cn/topics' was replaced by the JSON API.
    start_urls = ['https://www.infoq.cn/public/v1/topic/getList']
    custom_settings = {
        'ITEM_PIPELINES': {'tag_source.infoq.TagPipeline': 301},
        'LOG_LEVEL': 'INFO',
        # The API endpoint requires the session cookies sent in start_requests.
        'COOKIES_ENABLED': True,
    }

    def __init__(self):
        # Pages fetched so far.
        self.page_count = 0
        # Expected total number of pages (attribute name keeps the original
        # "totgal" typo so any external reader of it keeps working).
        self.totgal_pages = 654

    def start_requests(self):
        """Issue the initial API request with a browser UA and pinned cookies."""
        # Browser user agent
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36 Edg/90.0.818.41'
        }
        # Cookies captured from a live browser session; the endpoint rejects
        # anonymous requests. NOTE(review): these expire — verify periodically.
        cookies = {
            'LF_ID': '1596546004203-7769949-7289805',
            'GRID': '8ed2875-174b7fd-6ad9862-31c4b7f',
            'SERVERID': '3431a294a18c59fc8f5805662e2bd51e|1619058525|1619058010'
        }
        urls = [
            'https://www.infoq.cn/public/v1/topic/getList'
        ]
        for url in urls:
            yield scrapy.Request(url=url, headers=headers, cookies=cookies, callback=self.parse)

    def parse(self, response):
        """Dump the raw API response to ``test.html`` for inspection.

        The CSS/XPath extraction and '.next' pagination below were written
        for the old HTML page and are kept commented out until a JSON-based
        extraction replaces them.
        """
        self.page_count += 1
        print(response.body)
        with open('test.html', 'w') as f:
            f.write(response.text)
        # tags = response.css('li>div>.title')
        # print(tags)
        # for tag in tags:
        #     name = tag.xpath('h2/a/text()').get()
        #     desc = tag.xpath('p/text()').get()
        #     star = tag.xpath('div/strong/text()').get()
        #     yield {
        #         'name': name,
        #         'desc': desc,
        #         'star': star
        #     }
        # next_page_list = response.css('.next')
        # if len(next_page_list) > 0:
        #     next_page_item = next_page_list[len(next_page_list) - 1]
        #     next_page = next_page_item.css('a::attr(href)').get()
        #     print('next_page:', next_page)
        #     yield response.follow(next_page, callback=self.parse, dont_filter=True)
class TagPipeline(object):
def open_spider(self, spider):
self.file = open('dataset/segmentfault.tag.json', 'w')
self.file = open('dataset/infoq.tag.json', 'w')
self.file.write('[\n')
self.count = 0
self.tags = {}
......
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册