# yw.py
# Committed by 梦想橡皮擦
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class CbSpider(CrawlSpider):
    """Crawl the write.qq.com article portal and yield each article's title.

    The crawl starts at page 1 of the article listing. ``rules`` sends every
    matching article detail link to :meth:`parse_item` and follows the
    "next page" link to walk the pagination.

    NOTE: ``parse`` is intentionally not defined on this class. The Scrapy
    documentation warns against overriding ``parse`` in a ``CrawlSpider``
    because the base class uses it to implement the rule handling.
    """

    name = 'yw'
    allowed_domains = ['write.qq.com']
    start_urls = ['https://write.qq.com/portal/article?filterType=0&page=1']

    # URL extraction rules
    rules = (
        # Article detail pages: handed to parse_item. With a callback set,
        # Rule's ``follow`` defaults to False, so these pages are not crawled
        # any deeper.
        Rule(
            LinkExtractor(allow=r'.*/portal/content\?caid=\d+&feedType=2&lcid=\d+$'),
            callback="parse_item",
        ),
        # Find the URL of the next listing page and keep following it.
        Rule(
            LinkExtractor(restrict_xpaths="//a[@title='下一页']"),
            follow=True,
        ),
    )

    def parse_start_url(self, response):
        """Hook invoked with the response of each URL in ``start_urls``.

        The listing page yields no items of its own, and its links are
        handled by ``rules``, so this returns an empty list. Re-requesting
        the start URL here (as an earlier version did) would only fetch the
        same page a second time.

        Args:
            response: The response for a start URL.

        Returns:
            An empty list (no extra items or requests).
        """
        self.logger.debug("---parse_start_url---")
        return []

    def process_results(self, response, results):
        """Inspect the output of a callback and pass it through unchanged.

        CrawlSpider iterates over the value returned by this hook, so it
        must return ``results``; returning ``None`` would discard every
        item and request the callback produced.

        Args:
            response: The response the callback was invoked with.
            results: Whatever the callback returned (often a generator).

        Returns:
            ``results``, unmodified.
        """
        self.logger.debug("---process_results--- %r", results)
        return results

    def parse_item(self, response):
        """Extract the page title from an article detail page.

        Args:
            response: The response for a URL matched by the first rule.

        Yields:
            A dict with a single ``"title"`` key holding the stripped text of
            the page's ``<title>`` element, or ``""`` when there is none.
        """
        self.logger.debug("---parse_item--- %s", response.url)
        # extract_first() with a default avoids the IndexError that
        # extract()[0] raises when the page has no <title> text.
        title = response.css('title::text').extract_first(default='').strip()
        yield {"title": title}