oschina.py 2.2 KB
Newer Older
F
feilong 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
import os
import json
import urllib.request
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
import scrapy

from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

class OSChinaTagSpider(scrapy.Spider):
    """Spider that walks the OSChina question-tag listing page by page.

    Starting from the first listing page, every ``.tag-card`` element is
    turned into a dict with the keys ``name``, ``star`` and ``desc``.  The
    spider then follows the ``.next-item`` link until either no such link
    exists or ``totgal_pages`` pages have been parsed.
    """

    name = "oschina_tags"
    allowed_domains = ["oschina.net"]
    start_urls = ['https://www.oschina.net/question/tags?p=1']
    custom_settings = {
        'ITEM_PIPELINES': {'tag_source.oschina.TagPipeline': 301},
        'LOG_LEVEL': 'INFO',
    }

    # Text that follows the number inside a tag card's ``.meta`` element.
    _STAR_SUFFIX = ' 个问答'

    def __init__(self, *args, **kwargs):
        """Initialise the page counters.

        Positional and keyword arguments are forwarded to ``scrapy.Spider``
        so that spider arguments supplied by the crawler do not raise a
        ``TypeError``.
        """
        super().__init__(*args, **kwargs)
        # Number of listing pages parsed so far.
        self.page_count = 0
        # Upper bound on the number of listing pages to parse.
        # NOTE(review): the attribute name is misspelled ("totgal"); it is
        # kept as-is so that existing references keep working.
        self.totgal_pages = 606

    def parse(self, response):
        """Yield one dict per tag card, then a request for the next page.

        Args:
            response: the downloaded listing page.

        Yields:
            ``{'name': ..., 'star': ..., 'desc': ...}`` dicts, followed by at
            most one follow-up request for the next listing page.
        """
        self.page_count += 1
        for card in response.css('.tag-card'):
            content = card.css('.content')
            name = content.css('.header::text').get()
            meta = content.css('.meta::text').get()
            desc = content.css('.description::text').get()

            # A card without a parsable counter must not abort the whole
            # callback: that would also lose the next-page request below.
            try:
                star = int(meta.replace(self._STAR_SUFFIX, ''))
            except (AttributeError, ValueError):
                self.logger.warning(
                    'skipping tag %r on %s: unparsable meta %r',
                    name, response.url, meta,
                )
                continue

            yield {
                'name': name,
                'star': star,
                'desc': desc,
            }

        if self.page_count >= self.totgal_pages:
            return

        next_page = response.css('.next-item::attr(href)').get()
        if next_page is not None:
            self.logger.info('next_page: %s', next_page)
            yield response.follow(next_page, callback=self.parse, dont_filter=True)

class TagPipeline(object):
    """Item pipeline that writes unique tags to ``dataset/oschina.tag.json``.

    The output is a JSON array written incrementally: ``open_spider`` writes
    the opening bracket, ``process_item`` appends one element per new tag
    name, and ``close_spider`` writes the closing bracket.
    """

    def open_spider(self, spider):
        """Open the output file and reset the de-duplication state."""
        path = 'dataset/oschina.tag.json'
        # Create the target directory up front so a fresh checkout does not
        # fail with FileNotFoundError.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        # Items are dumped with ensure_ascii=False, so the encoding must be
        # fixed explicitly instead of depending on the platform default.
        self.file = open(path, 'w', encoding='utf-8')
        self.file.write('[\n')
        # Number of elements written so far (controls the ',' separator).
        self.count = 0
        # Names of the tags already written.
        self.tags = set()

    def close_spider(self, spider):
        """Terminate the JSON array and close the output file."""
        self.file.write('\n]')
        self.file.close()

    def process_item(self, item, spider):
        """Append ``item`` to the output unless its name was already written.

        Args:
            item: mapping with at least a ``name`` key.
            spider: the spider that produced the item (unused).

        Returns:
            The item, so that later pipeline stages still receive it.

        Raises:
            scrapy.exceptions.DropItem: if a tag with the same name has
                already been written.
        """
        name = item['name']
        if name in self.tags:
            # Imported here so the pipeline only needs Scrapy's exception
            # module on the duplicate path.
            from scrapy.exceptions import DropItem
            raise DropItem('duplicate tag: %r' % (name,))
        self.tags.add(name)

        # Every element but the first is preceded by a separator.
        prefix = ',\n  ' if self.count > 0 else '  '
        self.file.write(prefix + json.dumps(dict(item), ensure_ascii=False))
        self.count += 1
        return item

def fetch():
    """Run ``OSChinaTagSpider`` in a Scrapy ``CrawlerProcess``.

    A default ``Settings`` object is handed to the process; the spider's own
    ``custom_settings`` are applied on top of it by Scrapy.
    """
    # Previously the Settings object was created but never used; pass it
    # explicitly so the process configuration is visible at the call site.
    settings = Settings()
    process = CrawlerProcess(settings)
    process.crawl(OSChinaTagSpider)
    # NOTE(review): per the Scrapy docs, start() runs the reactor and by
    # default returns only after all crawls finish — confirm for the
    # installed version.
    process.start()