# -*- coding: UTF-8 -*-
# Author: huanhuilong
# Title: Python crawler
# Description: crawl Stack Overflow tag synonyms

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

# Module name used to address the pipeline class below; when run as a script
# this resolves to "__main__" (same value the original `__loader__.name` gave).
BASE_DIR = __name__


class StackOverflowTagSpider(scrapy.Spider):
    """Spider that walks the Stack Overflow tag-synonyms listing page by page
    and yields one item per tag name found."""

    name = "stackoverflow_tags"
    allowed_domains = ["stackoverflow.com"]
    start_urls = ['https://stackoverflow.com/tags/synonyms?page=1']
    custom_settings = {
        # NOTE(review): no TagPipeline definition is visible in this file —
        # confirm it exists in this module, otherwise Scrapy will fail to load it.
        'ITEM_PIPELINES': {f'{BASE_DIR}.TagPipeline': 301},
        'LOG_LEVEL': 'INFO'
    }

    def __init__(self, *args, **kwargs):
        # Forward to Spider.__init__ so spider arguments (-a key=value) and
        # base-class initialization keep working.
        super().__init__(*args, **kwargs)
        self.total_pages = 45  # hard stop: crawl at most this many pages
        self.page_count = 0

    def parse(self, response):
        """Yield {'name': <tag>} for every tag on the page, then follow the
        'next' pagination link until total_pages pages have been visited."""
        self.page_count += 1

        for tag in response.css('.post-tag::text'):
            yield {'name': tag.get()}

        if self.page_count < self.total_pages:
            next_page_list = response.css('a.js-pagination-item::attr(href)')
            if len(next_page_list) > 0:
                # The last pagination link is the "next" button.
                next_page = next_page_list[len(next_page_list) - 1].get()
                self.logger.info('next_page: %s', next_page)
                # dont_filter: the next URL may repeat across pages; follow anyway.
                yield response.follow(next_page, callback=self.parse,
                                      dont_filter=True)


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(StackOverflowTagSpider)
    process.start()