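"""Scrapy spider for the cnblogs.com tag list.

Crawls https://q.cnblogs.com/tag/list page by page and stores the
deduplicated tags in dataset/cnblogs.tag.json as a JSON array of
{"name": ..., "star": ...} objects.
"""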
import os
import json

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import DropItem

class CNBlogTagSpider(scrapy.Spider):
    name = "cnblogs_tags"
    allowed_domains = ["cnblogs.com"]
    start_urls = ['https://q.cnblogs.com/tag/list?pageindex=1']
    custom_settings = {
        # This pipeline path assumes the module is importable as tag_source.cnblogs.
        'ITEM_PIPELINES': {'tag_source.cnblogs.TagPipeline': 301},
        'LOG_LEVEL': 'INFO',
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.page_count = 0
        self.total_pages = 520  # stop after this many list pages

    def parse(self, response):
        self.page_count += 1
        tag_div = response.css('.tag-div')
        tags = tag_div.xpath('div/table/tr/td')
        for tag in tags:
            name = tag.xpath('li/a/text()').get()
            star = tag.xpath('li/text()').get()
            # The count comes through as text like "(123)"; drop the parentheses.
            star = star[1:-1]

            yield {
                'name': name,
                'star': star,
            }
        
        # Assume the last anchor in the pager is the "Next" link; dont_filter
        # bypasses Scrapy's duplicate-request filter so revisited pager URLs
        # are still crawled.
        if self.page_count < self.total_pages:
            next_page_list = response.css('#pager>a')
            if len(next_page_list) > 0:
                next_page = next_page_list[-1].css('::attr(href)').get()
                self.logger.info('next_page: %s', next_page)
                yield response.follow(next_page, callback=self.parse, dont_filter=True)

class TagPipeline:
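    """Write scraped tags to dataset/cnblogs.tag.json as a JSON array,
    streaming each unique tag to disk as it arrives: '[' on open, one
    comma-separated line per item, ']' on close."""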
    def open_spider(self, spider):
        # Make sure the output directory exists, and write UTF-8 so the
        # Chinese tag names survive ensure_ascii=False below.
        os.makedirs('dataset', exist_ok=True)
        self.file = open('dataset/cnblogs.tag.json', 'w', encoding='utf-8')
        self.file.write('[\n')
        self.count = 0
        self.tags = {}

    def close_spider(self, spider):
        self.file.write('\n]')
        self.file.close()

    def process_item(self, item, spider):
        # Tag names repeat across pages; drop duplicates so each name is
        # written exactly once.
        if item['name'] in self.tags:
            raise DropItem('duplicate tag: %s' % item['name'])
        self.tags[item['name']] = True

        words = []
        if self.count > 0:
            words.append(',\n')
        words.append('  ')
        words.append(json.dumps(item, ensure_ascii=False).strip())
        self.file.write(''.join(words))
        self.count += 1
        return item

def fetch():
    process = CrawlerProcess()
    process.crawl(CNBlogTagSpider)
    process.start()  # blocks until the crawl finishes
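
# Minimal entry point for running the spider directly. Note that the
# ITEM_PIPELINES path above assumes this module is importable as
# tag_source.cnblogs (e.g. run `python -m tag_source.cnblogs` from the
# project root); adjust that path if you run this file standalone.
if __name__ == '__main__':
    fetch()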