cb.py 869 字节
Newer Older
梦想橡皮擦's avatar
梦想橡皮擦 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
import scrapy
from chebiao.items import ChebiaoItem
from urllib.parse import urlparse


class CbSpider(scrapy.Spider):
    """Spider that collects download links from chebiao.net listing pages.

    Every URL in ``start_urls`` is fetched and handed to :meth:`parse`,
    which emits one ``ChebiaoItem`` per linked entry found on the page.
    """

    name = 'cb'
    allowed_domains = ['chebiao.net']
    start_urls = ['http://www.chebiao.net/domestic.php', 'http://www.chebiao.net/es.php',
                  'http://www.chebiao.net/jsk.php', 'http://www.chebiao.net/other.php',
                  'http://www.chebiao.net/famous.php']

    def parse(self, response):
        """Yield a ``ChebiaoItem`` for each linked ``<dd>`` entry on the page.

        Entries are the ``<dd>`` elements under ``div.box2 > dl``. For each
        one, the anchor text becomes ``file_name`` and the anchor's query
        string is re-attached to the site's ``download.php`` endpoint to
        form ``file_url``.

        Args:
            response: The downloaded listing page.

        Yields:
            ChebiaoItem: populated with ``file_name`` and ``file_url``.
        """
        down_url = "http://www.chebiao.net/download.php"
        for dd in response.xpath("//div[@class='box2']/dl/dd"):
            # extract_first() returns None instead of raising IndexError when
            # the <dd> has no anchor, no link text, or no href attribute.
            name = dd.xpath('./a/text()').extract_first()
            href = dd.xpath('./a/@href').extract_first()
            if name is None or href is None:
                # Skip the incomplete row rather than aborting the generator,
                # which would lose every remaining entry on this page.
                self.logger.debug(
                    "Skipping entry without link text or href on %s", response.url
                )
                continue

            item = ChebiaoItem()
            item['file_name'] = name
            # Only the query string of the original link is kept; the path is
            # replaced by the download endpoint.
            item['file_url'] = down_url + "?" + urlparse(href).query

            yield item