tao_data.py

import scrapy
from tao.items import TaoItem
from scrapy.linkextractors import LinkExtractor


class TaoDataSpider(scrapy.Spider):
    name = 'tao_data'
    allowed_domains = ['taosj.com']
    start_urls = [f'https://www.taosj.com/articles?pageNo={page}' for page in range(1, 124)]

    def parse(self, response):
        link_extractor = LinkExtractor(allow=r'www\.taosj\.com/articles/\d+', restrict_css='a.report-page-list-title')
        links = link_extractor.extract_links(response)
        for l in links:
            item = {
                "url": l.url,
                "text": l.text
            }
            yield item