tao_data.py 629 字节
Newer Older
梦想橡皮擦's avatar
52&53  
梦想橡皮擦 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
import scrapy
from tao.items import TaoItem
from scrapy.linkextractors import LinkExtractor


class TaoDataSpider(scrapy.Spider):
    name = 'tao_data'
    allowed_domains = ['taosj.com']
    start_urls = [f'https://www.taosj.com/articles?pageNo={page}' for page in range(1, 124)]

    def parse(self, response):
        link_extractor = LinkExtractor(allow=r'www\.taosj\.com/articles/\d+', restrict_css='a.report-page-list-title')
        links = link_extractor.extract_links(response)
        for l in links:
            item = {
                "url": l.url,
                "text": l.text
            }
            yield item