Commit 2fd13efc authored by 幻灰龙

Merge branch 'master' into 'master'

Add versioned tag set

See merge request !14
"图像处理",
"计算机视觉",
"自然语言处理",
"语音识别",
"机器翻译",
"机器学习",
"神经网络",
"深度学习",
"迁移学习",
"强化学习",
"集成学习",
"联邦学习",
"人机对话",
"知识图谱",
"图计算",
"情感分析",
"目标检测",
"目标跟踪",
"生成对抗网络",
"协同过滤",
"语言模型",
"智能推荐",
"tensorflow",
"pytorch",
"oneflow",
"opencv",
"keras",
"caffe",
"sklearn",
"mxnet",
"mlnet",
"mllib",
"paddlepaddle",
"jittor",
"deeplearning4j",
"mindspore",
"cnn",
"rnn",
"lstm",
"gru",
"dnn",
"mnn",
"alexnet",
"googlenet",
"deepfm",
"boosting",
"tensorrt",
"yolo",
"elmo",
"transformer",
"word2vec",
"doc2vec",
"glove",
"xlnet",
"bert",
"nnlm",
"lda主题模型",
"deepdive",
[
]
\ No newline at end of file
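The merge itself only adds the raw tag strings. As a hedged illustration of how such a tag set could be consumed, the sketch below loads the JSON array and matches tags against a piece of text; the file name ai_tags.json and the match_tags helper are assumptions made for this example, not names taken from the repository.

import json

# Minimal sketch (assumed file name): load the versioned tag set added above.
with open('ai_tags.json', encoding='utf-8') as f:
    tags = json.load(f)

def match_tags(text):
    # Return every tag that literally appears in the text (case-insensitive).
    lowered = text.lower()
    return [tag for tag in tags if tag.lower() in lowered]

print(match_tags('用 pytorch 训练一个 LSTM 做情感分析'))
# -> ['情感分析', 'pytorch', 'lstm']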
@@ -11,42 +11,63 @@ from scrapy.settings import Settings
class InfoQSpider(scrapy.Spider):
    name = "infoq_tags"
    allowed_domains = ["infoq.cn"]
    start_urls = ['https://www.infoq.cn/public/v1/topic/getList']
    custom_settings = {
        'ITEM_PIPELINES': {'tag_source.infoq.TagPipeline': 301},
        'LOG_LEVEL': 'INFO',
        'COOKIES_ENABLED': True,
    }

    def __init__(self):
        self.page_count = 0
        self.totgal_pages = 654

    def start_requests(self):
        # Browser user agent
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36 Edg/90.0.818.41'
        }
        # Cookies sent with the request
        cookies = {
            'LF_ID': '1596546004203-7769949-7289805',
            'GRID': '8ed2875-174b7fd-6ad9862-31c4b7f',
            'SERVERID': '3431a294a18c59fc8f5805662e2bd51e|1619058525|1619058010'
        }
        urls = [
            'https://www.infoq.cn/public/v1/topic/getList'
        ]
        for url in urls:
            yield scrapy.Request(url=url, headers=headers, cookies=cookies, callback=self.parse)

    def parse(self, response):
        self.page_count += 1
        print(response.body)
        # Dump the raw response for inspection; the HTML selectors below are disabled
        # while the spider moves from the topics page to the getList API endpoint.
        with open('test.html', 'w') as f:
            f.write(response.text)
        # tags = response.css('li>div>.title')
        # print(tags)
        # for tag in tags:
        #     name = tag.xpath('h2/a/text()').get()
        #     desc = tag.xpath('p/text()').get()
        #     star = tag.xpath('div/strong/text()').get()
        #     yield {
        #         'name': name,
        #         'desc': desc,
        #         'star': star
        #     }
        # next_page_list = response.css('.next')
        # if len(next_page_list) > 0:
        #     next_page_item = next_page_list[len(next_page_list) - 1]
        #     next_page = next_page_item.css('a::attr(href)').get()
        #     print('next_page:', next_page)
        #     yield response.follow(next_page, callback=self.parse, dont_filter=True)


class TagPipeline(object):

    def open_spider(self, spider):
        self.file = open('dataset/infoq.tag.json', 'w')
        self.file.write('[\n')
        self.count = 0
        self.tags = {}
......
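The hunk header shows this module importing scrapy.settings.Settings, which suggests it is launched programmatically rather than only via the scrapy CLI. The snippet below is a hedged sketch of one common way to run such a spider with Scrapy's CrawlerProcess; the import path tag_source.infoq is taken from the ITEM_PIPELINES setting above, while the runner itself is an assumption, not the project's confirmed entry point.

from scrapy.crawler import CrawlerProcess
from tag_source.infoq import InfoQSpider  # assumed import path, per ITEM_PIPELINES above

process = CrawlerProcess()
process.crawl(InfoQSpider)  # custom_settings (pipeline, log level, cookies) apply per spider
process.start()             # blocks until the crawl finishes; TagPipeline writes dataset/infoq.tag.json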