提交 1d7bbf98 编写于 作者: F feilong

增加带版本标签集

上级 5dd23caa
"图像处理",
"计算机视觉",
"自然语言处理",
"语音识别",
"机器翻译",
"机器学习",
"神经网络",
"深度学习",
"迁移学习",
"强化学习",
"集成学习",
"联邦学习",
"人机对话",
"知识图谱",
"图计算",
"情感分析",
"目标检测",
"目标跟踪",
"生成对抗网络",
"协同过滤",
"语言模型",
"智能推荐",
"tensorflow",
"pytorch",
"oneflow",
"opencv",
"keras",
"caffe",
"sklearn",
"mxnet",
"mlnet",
"mllib",
"paddlepaddle",
"jittor",
"deeplearning4j",
"mindspore",
"cnn",
"rnn",
"lstm",
"gru",
"dnn",
"mnn",
"alexnet",
"googlenet",
"deepfm",
"boosting",
"tensorrt",
"yolo",
"elmo",
"transformer",
"word2vec",
"doc2vec",
"glove",
"xlnet",
"bert",
"nnlm",
"lda主题模型",
"deepdive",
[
]
\ No newline at end of file
@@ -11,42 +11,63 @@ from scrapy.settings import Settings
class InfoQSpider(scrapy.Spider):
    """Crawl topic tags from InfoQ.cn via its public topic-list JSON API.

    Yielded items are processed by ``tag_source.infoq.TagPipeline`` (wired up
    in ``custom_settings``), which writes them to ``dataset/infoq.tag.json``.
    """

    name = "infoq_tags"
    allowed_domains = ["infoq.cn"]
    # The topics HTML page is rendered client-side, so the spider targets the
    # JSON API endpoint directly instead of 'https://www.infoq.cn/topics'.
    start_urls = ['https://www.infoq.cn/public/v1/topic/getList']
    custom_settings = {
        'ITEM_PIPELINES': {'tag_source.infoq.TagPipeline': 301},
        'LOG_LEVEL': 'INFO',
        # The endpoint needs the session cookies sent in start_requests().
        'COOKIES_ENABLED': True,
    }

    def __init__(self):
        # Number of pages fetched so far.
        self.page_count = 0
        # NOTE(review): 'totgal' looks like a typo for 'total'; kept as-is in
        # case code outside this view references the attribute name.
        self.totgal_pages = 654

    def start_requests(self):
        """Issue the initial API request with a browser User-Agent and cookies.

        Yields:
            scrapy.Request: one request per seed URL, parsed by :meth:`parse`.
        """
        # Pretend to be a desktop browser; sites commonly reject Scrapy's
        # default User-Agent.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36 Edg/90.0.818.41'
        }
        # Session cookies captured from a browser session; presumably required
        # by the site's anti-bot/session check — TODO confirm they still work.
        cookies = {
            'LF_ID': '1596546004203-7769949-7289805',
            'GRID': '8ed2875-174b7fd-6ad9862-31c4b7f',
            'SERVERID': '3431a294a18c59fc8f5805662e2bd51e|1619058525|1619058010'
        }
        urls = [
            'https://www.infoq.cn/public/v1/topic/getList'
        ]
        for url in urls:
            yield scrapy.Request(url=url, headers=headers, cookies=cookies, callback=self.parse)

    def parse(self, response):
        """Dump the raw API response to ``test.html`` for inspection.

        The spider is mid-migration from scraping the HTML topics page to
        consuming the JSON API; the old CSS/XPath extraction and pagination
        logic was removed (it remains in version control history) and item
        extraction from the JSON body is still to be implemented.
        """
        self.page_count += 1
        # Debug aid: save the raw body so the API response format can be
        # examined before writing the JSON extraction code.
        with open('test.html', 'w') as f:
            f.write(response.text)
class TagPipeline(object): class TagPipeline(object):
def open_spider(self, spider): def open_spider(self, spider):
self.file = open('dataset/segmentfault.tag.json', 'w') self.file = open('dataset/infoq.tag.json', 'w')
self.file.write('[\n') self.file.write('[\n')
self.count = 0 self.count = 0
self.tags = {} self.tags = {}
......
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册