import logging
import tempfile
import warnings
import unittest

from twisted.internet import defer
import twisted.trial.unittest

import scrapy
from scrapy.crawler import Crawler, CrawlerRunner, CrawlerProcess
from scrapy.settings import Settings, default_settings
from scrapy.spiderloader import SpiderLoader
from scrapy.utils.log import configure_logging, get_scrapy_root_handler
from scrapy.utils.spider import DefaultSpider
from scrapy.utils.misc import load_object
from scrapy.extensions.throttle import AutoThrottle
from scrapy.extensions import telnet


class BaseCrawlerTest(unittest.TestCase):
    """Shared assertion helpers for the crawler test cases below."""

    def assertOptionIsDefault(self, settings, key):
        """Assert that ``settings`` is a ``Settings`` object whose value for
        ``key`` equals the attribute of the same name on ``default_settings``.
        """
        self.assertIsInstance(settings, Settings)
        self.assertEqual(settings[key], getattr(default_settings, key))


class CrawlerTestCase(BaseCrawlerTest):
    """Tests for ``Crawler`` construction and settings handling."""

    def setUp(self):
        self.crawler = Crawler(DefaultSpider, Settings())

    def test_deprecated_attribute_spiders(self):
        # Reading ``Crawler.spiders`` is expected to emit exactly one warning
        # that names the attribute, and to return an instance of the class
        # configured in SPIDER_LOADER_CLASS.
        with warnings.catch_warnings(record=True) as w:
            spiders = self.crawler.spiders
            self.assertEqual(len(w), 1)
            self.assertIn("Crawler.spiders", str(w[0].message))
            sl_cls = load_object(self.crawler.settings['SPIDER_LOADER_CLASS'])
            self.assertIsInstance(spiders, sl_cls)

            # A second access must not add another warning to the record.
            self.crawler.spiders
            self.assertEqual(len(w), 1, "Warn deprecated access only once")

    def test_populate_spidercls_settings(self):
        # TEST1 is defined by both the spider and the project; the test
        # expects the spider's ``custom_settings`` value to win.
        spider_settings = {'TEST1': 'spider', 'TEST2': 'spider'}
        project_settings = {'TEST1': 'project', 'TEST3': 'project'}

        class CustomSettingsSpider(DefaultSpider):
            custom_settings = spider_settings

        settings = Settings()
        settings.setdict(project_settings, priority='project')
        crawler = Crawler(CustomSettingsSpider, settings)

        self.assertEqual(crawler.settings.get('TEST1'), 'spider')
        self.assertEqual(crawler.settings.get('TEST2'), 'spider')
        self.assertEqual(crawler.settings.get('TEST3'), 'project')

        # The object passed in must stay unfrozen, while the crawler's own
        # settings object is expected to be frozen.
        self.assertFalse(settings.frozen)
        self.assertTrue(crawler.settings.frozen)

    def test_crawler_accepts_dict(self):
        crawler = Crawler(DefaultSpider, {'foo': 'bar'})
        self.assertEqual(crawler.settings['foo'], 'bar')
        self.assertOptionIsDefault(crawler.settings, 'RETRY_ENABLED')

    def test_crawler_accepts_None(self):
        crawler = Crawler(DefaultSpider)
        self.assertOptionIsDefault(crawler.settings, 'RETRY_ENABLED')


class SpiderSettingsTestCase(unittest.TestCase):
    """Tests that a spider's ``custom_settings`` reach the crawler."""

    def test_spider_custom_settings(self):
        class MySpider(scrapy.Spider):
            name = 'spider'
            custom_settings = {
                'AUTOTHROTTLE_ENABLED': True
            }

        crawler = Crawler(MySpider, {})
        # With AUTOTHROTTLE_ENABLED set by the spider, AutoThrottle is
        # expected among the crawler's extension middlewares.
        enabled_exts = [e.__class__ for e in crawler.extensions.middlewares]
        self.assertIn(AutoThrottle, enabled_exts)


class CrawlerLoggingTestCase(unittest.TestCase):
    """Tests for the interaction between ``Crawler`` and root logging."""

    def test_no_root_handler_installed(self):
        handler = get_scrapy_root_handler()
        if handler is not None:
            logging.root.removeHandler(handler)

        class MySpider(scrapy.Spider):
            name = 'spider'

        # Built only for its side effects; the object itself is not inspected.
        crawler = Crawler(MySpider, {})
        # unittest assertion instead of a bare ``assert`` so the check is not
        # stripped when Python runs with -O.
        self.assertIsNone(get_scrapy_root_handler())

    def test_spider_custom_settings_log_level(self):
        with tempfile.NamedTemporaryFile() as log_file:
            class MySpider(scrapy.Spider):
                name = 'spider'
                custom_settings = {
                    'LOG_LEVEL': 'INFO',
                    'LOG_FILE': log_file.name,
                    # disable telnet if not available to avoid an extra warning
                    'TELNETCONSOLE_ENABLED': telnet.TWISTED_CONCH_AVAILABLE,
                }

            configure_logging()
            self.assertEqual(get_scrapy_root_handler().level, logging.DEBUG)
            crawler = Crawler(MySpider, {})
            self.assertEqual(get_scrapy_root_handler().level, logging.INFO)

            # Baseline taken before emitting our own INFO record, so the
            # later check compares a delta rather than an absolute count.
            info_count = crawler.stats.get_value('log_count/INFO')
            logging.debug('debug message')
            logging.info('info message')
            logging.warning('warning message')
            logging.error('error message')

            # Read while the temporary file still exists.
            logged = log_file.read().decode('utf8')

        self.assertNotIn('debug message', logged)
        self.assertIn('info message', logged)
        self.assertIn('warning message', logged)
        self.assertIn('error message', logged)
        self.assertEqual(crawler.stats.get_value('log_count/ERROR'), 1)
        self.assertEqual(crawler.stats.get_value('log_count/WARNING'), 1)
        self.assertEqual(
            crawler.stats.get_value('log_count/INFO') - info_count, 1)
        self.assertEqual(crawler.stats.get_value('log_count/DEBUG', 0), 0)


class SpiderLoaderWithWrongInterface(object):
    """Stand-in loader class that defines none of ``SpiderLoader``'s methods.

    Referenced by dotted path from ``test_spider_manager_verify_interface``.
    """

    def unneeded_method(self):
        pass


class CustomSpiderLoader(SpiderLoader):
    """Trivial ``SpiderLoader`` subclass, referenced by dotted path from
    ``test_spidermanager_deprecation``.
    """
    pass


class CrawlerRunnerTestCase(BaseCrawlerTest):
    """Tests for ``CrawlerRunner`` construction and spider loader lookup."""

    def test_spider_manager_verify_interface(self):
        settings = Settings({
            'SPIDER_LOADER_CLASS': 'tests.test_crawler.SpiderLoaderWithWrongInterface'
        })
        with warnings.catch_warnings(record=True) as w:
            # Close the assertRaises context before inspecting the recorded
            # warnings: statements placed after the raising call inside the
            # same ``with`` body would never execute.
            with self.assertRaises(AttributeError):
                CrawlerRunner(settings)
            self.assertEqual(len(w), 1)
            self.assertIn("SPIDER_LOADER_CLASS", str(w[0].message))
            self.assertIn("scrapy.interfaces.ISpiderLoader", str(w[0].message))

    def test_crawler_runner_accepts_dict(self):
        runner = CrawlerRunner({'foo': 'bar'})
        self.assertEqual(runner.settings['foo'], 'bar')
        self.assertOptionIsDefault(runner.settings, 'RETRY_ENABLED')

    def test_crawler_runner_accepts_None(self):
        runner = CrawlerRunner()
        self.assertOptionIsDefault(runner.settings, 'RETRY_ENABLED')

    def test_deprecated_attribute_spiders(self):
        # Reading ``CrawlerRunner.spiders`` is expected to emit one warning
        # that names both the old attribute and ``spider_loader``.
        with warnings.catch_warnings(record=True) as w:
            runner = CrawlerRunner(Settings())
            spiders = runner.spiders
            self.assertEqual(len(w), 1)
            self.assertIn("CrawlerRunner.spiders", str(w[0].message))
            self.assertIn("CrawlerRunner.spider_loader", str(w[0].message))
            sl_cls = load_object(runner.settings['SPIDER_LOADER_CLASS'])
            self.assertIsInstance(spiders, sl_cls)

    def test_spidermanager_deprecation(self):
        # SPIDER_MANAGER_CLASS is expected to still select the loader class,
        # with a single warning pointing at SPIDER_LOADER_CLASS.
        with warnings.catch_warnings(record=True) as w:
            runner = CrawlerRunner({
                'SPIDER_MANAGER_CLASS': 'tests.test_crawler.CustomSpiderLoader'
            })
            self.assertIsInstance(runner.spider_loader, CustomSpiderLoader)
            self.assertEqual(len(w), 1)
            self.assertIn('Please use SPIDER_LOADER_CLASS', str(w[0].message))


class CrawlerProcessTest(BaseCrawlerTest):
    """Tests for ``CrawlerProcess`` construction."""

    def test_crawler_process_accepts_dict(self):
        process = CrawlerProcess({'foo': 'bar'})
        self.assertEqual(process.settings['foo'], 'bar')
        self.assertOptionIsDefault(process.settings, 'RETRY_ENABLED')

    def test_crawler_process_accepts_None(self):
        process = CrawlerProcess()
        self.assertOptionIsDefault(process.settings, 'RETRY_ENABLED')


class ExceptionSpider(scrapy.Spider):
    """Spider whose ``from_crawler`` always raises ``ValueError``."""

    name = 'exception'

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        raise ValueError('Exception in from_crawler method')


class NoRequestsSpider(scrapy.Spider):
    """Spider whose ``start_requests`` returns an empty list."""

    name = 'no_request'

    def start_requests(self):
        return []


class CrawlerRunnerHasSpider(twisted.trial.unittest.TestCase):
    """Tests for ``CrawlerRunner.bootstrap_failed`` after ``crawl()`` calls."""

    @defer.inlineCallbacks
    def test_crawler_runner_bootstrap_successful(self):
        runner = CrawlerRunner()
        yield runner.crawl(NoRequestsSpider)
        self.assertEqual(runner.bootstrap_failed, False)

    @defer.inlineCallbacks
    def test_crawler_runner_bootstrap_successful_for_several(self):
        runner = CrawlerRunner()
        yield runner.crawl(NoRequestsSpider)
        yield runner.crawl(NoRequestsSpider)
        self.assertEqual(runner.bootstrap_failed, False)

    @defer.inlineCallbacks
    def test_crawler_runner_bootstrap_failed(self):
        runner = CrawlerRunner()

        try:
            yield runner.crawl(ExceptionSpider)
        except ValueError:
            pass
        else:
            self.fail('Exception should be raised from spider')

        self.assertEqual(runner.bootstrap_failed, True)

    @defer.inlineCallbacks
    def test_crawler_runner_bootstrap_failed_for_several(self):
        runner = CrawlerRunner()

        try:
            yield runner.crawl(ExceptionSpider)
        except ValueError:
            pass
        else:
            self.fail('Exception should be raised from spider')

        # A later successful crawl is expected to leave the flag set.
        yield runner.crawl(NoRequestsSpider)

        self.assertEqual(runner.bootstrap_failed, True)