so_tag_spider.py
# -*- coding: UTF-8 -*-
# Author: 幻灰龙
# Title: Python web crawler
# Description: scrape Stack Overflow tags

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

# Resolve the current module name (e.g. '__main__' when run as a script) so the
# item pipeline can be referenced by its dotted path in custom_settings.
BASE_DIR = __loader__.name


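# NOTE: custom_settings below registers '<module>.TagPipeline' as an item
# pipeline, but no such class exists in this file. The class below is a
# minimal, assumed sketch (writing tag names to a local 'tags.txt') so the
# script runs as-is; the real pipeline may store items elsewhere.
class TagPipeline:
    def open_spider(self, spider):
        self.file = open('tags.txt', 'w', encoding='utf-8')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        # Each item yielded by the spider is {'name': <tag text>}.
        self.file.write(item['name'] + '\n')
        return item
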
class StackOverflowTagSpider(scrapy.Spider):
    name = "stackoverflow_tags"
    allowed_domains = ["stackoverflow.com"]
    start_urls = ['https://stackoverflow.com/tags/synonyms?page=1']
    custom_settings = {
        'ITEM_PIPELINES': {f'{BASE_DIR}.TagPipeline': 301},
        'LOG_LEVEL': 'INFO'
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Hard-coded upper bound on the number of listing pages to crawl.
        self.total_pages = 45
        self.page_count = 0

    def parse(self, response):
        self.page_count += 1
        # Every tag name on the synonyms listing carries the .post-tag class.
        tags = response.css('.post-tag::text')
        for tag in tags:
            yield {'name': tag.get()}

        if self.page_count < self.total_pages:
            # The last pagination link on the page is the "next" link.
            next_page_list = response.css('a.js-pagination-item::attr(href)')
            if len(next_page_list) > 0:
                next_page = next_page_list[-1].get()
                self.logger.info('next_page: %s', next_page)
                yield response.follow(next_page, callback=self.parse, dont_filter=True)


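# Usage (assumed): run `python so_tag_spider.py`; with the TagPipeline sketch
# above, the scraped tag names are written to tags.txt in the working directory.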
if __name__ == "__main__":
    # Run the spider standalone, without a full Scrapy project.
    settings = Settings()
    process = CrawlerProcess(settings)
    process.crawl(StackOverflowTagSpider)
    process.start()