import json
import os

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import DropItem


class StackOverflowTagSpider(scrapy.Spider):
    name = "stackoverflow_tags"
    allowed_domains = ["stackoverflow.com"]
    start_urls = ['https://stackoverflow.com/tags/synonyms?page=1']
    custom_settings = {
        # Module path assumes this file lives at tag_source/stackoverflow.py.
        'ITEM_PIPELINES': {'tag_source.stackoverflow.TagPipeline': 301},
        'LOG_LEVEL': 'INFO'
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.total_pages = 45  # was misspelled 'totgal_pages'
        self.page_count = 0

    def parse(self, response):
        self.page_count += 1
        # Every tag on the synonyms page is rendered as a .post-tag element.
        for tag in response.css('.post-tag::text'):
            yield {'name': tag.get()}
        # Keep following pagination until total_pages is reached. The line
        # defining next_page_list was lost in the original; this selector is
        # an assumption that the last pagination link points to the next page.
        if self.page_count < self.total_pages:
            next_page_list = response.css('.s-pagination a::attr(href)')
            if len(next_page_list) > 0:
                next_page = next_page_list[-1].get()
                print('next_page:', next_page)
                yield response.follow(next_page, callback=self.parse, dont_filter=True)


class TagPipeline(object):
    """Writes scraped tags to a JSON array, one object per line, skipping duplicates."""

    def open_spider(self, spider):
        os.makedirs('dataset', exist_ok=True)  # the output directory may not exist yet
        self.file = open('dataset/stackoverflow.tag.json', 'w', encoding='utf-8')
        self.file.write('[\n')
        self.count = 0
        self.tags = {}

    def close_spider(self, spider):
        self.file.write('\n]')
        self.file.close()

    def process_item(self, item, spider):
        # Drop duplicates explicitly so downstream pipelines never receive None.
        if self.tags.get(item['name']) is not None:
            raise DropItem('duplicate tag: ' + item['name'])
        self.tags[item['name']] = True
        words = []
        if self.count > 0:
            words.append(',\n')
        words.append(' ')
        words.append(json.dumps(item, ensure_ascii=False).strip())
        self.file.write(''.join(words))
        self.count += 1
        return item


def fetch():
    # The original built an unused Settings() object; CrawlerProcess picks up
    # the spider's custom_settings on its own, so it is dropped here.
    process = CrawlerProcess()
    process.crawl(StackOverflowTagSpider)
    process.start()
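

# Usage sketch: the module defines fetch() but never calls it, presumably
# because it is imported elsewhere. A minimal entry point, assuming the
# pipeline's output path above, might look like this:
if __name__ == '__main__':
    fetch()
    # After the crawl finishes, sanity-check the JSON array the pipeline wrote.
    with open('dataset/stackoverflow.tag.json', encoding='utf-8') as f:
        tags = json.load(f)
    print(len(tags), 'unique tags scraped')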