Commit 870438e5 authored by Julia Medina

Update tests utils, fixing get_crawler and removing docrawl

Parent d4027356
@@ -20,15 +20,17 @@ def assert_aws_environ():
if 'AWS_ACCESS_KEY_ID' not in os.environ:
raise SkipTest("AWS keys not found")
def get_crawler(settings_dict=None):
def get_crawler(spidercls=None, settings_dict=None):
"""Return an unconfigured Crawler object. If settings_dict is given, it
will be used to populate the crawler settings with a project level
priority.
"""
from scrapy.crawler import Crawler
from scrapy.crawler import CrawlerRunner
from scrapy.settings import Settings
from scrapy.spider import Spider
return Crawler(Settings(settings_dict))
runner = CrawlerRunner(Settings(settings_dict))
return runner._create_crawler(spidercls or Spider)
def get_pythonpath():
"""Return a PYTHONPATH suitable to use in processes so that they find this
@@ -62,10 +64,3 @@ def assert_samelines(testcase, text1, text2, msg=None):
line endings between platforms
"""
testcase.assertEqual(text1.splitlines(), text2.splitlines(), msg)
def docrawl(spider, settings=None):
"""Configure and start Crawler; return the result of crawler.start()"""
crawler = get_crawler(settings)
crawler.configure()
crawler.crawl(spider)
return crawler.start()
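In short, the removed docrawl helper (which configured and started a crawler for an already-built spider instance) is replaced by the updated get_crawler: tests now request a crawler bound to a spider class, pass the spider arguments to crawler.crawl(), and read results back from crawler.spider and crawler.stats. A minimal sketch of the new pattern, modelled on the close-spider tests further down in this diff; the test class name is illustrative and the MockServer setup is the one those test modules already use:

```python
from twisted.internet import defer
from twisted.trial.unittest import TestCase

from scrapy.utils.test import get_crawler
from tests.spiders import FollowAllSpider
from tests.mockserver import MockServer


class MigrationSketch(TestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_closespider_pagecount(self):
        # Old style (removed): spider = FollowAllSpider()
        #                      yield docrawl(spider, {'CLOSESPIDER_PAGECOUNT': 5})
        # New style: get_crawler binds the spider class plus project-level settings,
        # crawl() receives the spider arguments, and results are read off the crawler.
        crawler = get_crawler(FollowAllSpider, {'CLOSESPIDER_PAGECOUNT': 5})
        yield crawler.crawl()
        self.assertEqual(crawler.spider.meta['close_reason'],
                         'closespider_pagecount')
        self.assertTrue(
            crawler.stats.get_value('response_received_count') >= 5)
```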
@@ -14,6 +14,7 @@ tests/test_downloadermiddleware_ajaxcrawlable.py
tests/test_downloadermiddleware_cookies.py
tests/test_downloadermiddleware_decompression.py
tests/test_downloadermiddleware_defaultheaders.py
tests/test_downloadermiddleware_downloadtimeout.py
tests/test_downloadermiddleware_httpauth.py
tests/test_downloadermiddleware_httpcache.py
tests/test_downloadermiddleware_httpcompression.py
@@ -22,6 +23,7 @@ tests/test_downloadermiddleware.py
tests/test_downloadermiddleware_redirect.py
tests/test_downloadermiddleware_retry.py
tests/test_downloadermiddleware_robotstxt.py
tests/test_downloadermiddleware_stats.py
tests/test_downloadermiddleware_useragent.py
tests/test_dupefilter.py
tests/test_engine.py
@@ -48,9 +50,12 @@ tests/test_spidermanager/test_spiders/spider1.py
tests/test_spidermanager/test_spiders/spider2.py
tests/test_spidermanager/test_spiders/spider3.py
tests/test_spidermanager/test_spiders/spider4.py
tests/test_spidermiddleware_depth.py
tests/test_spidermiddleware_httperror.py
tests/test_spidermiddleware_offsite.py
tests/test_spidermiddleware_referer.py
tests/test_spider.py
tests/test_stats.py
tests/test_utils_defer.py
tests/test_utils_iterators.py
tests/test_utils_jsonrpc.py
......
from twisted.internet import defer
from twisted.trial.unittest import TestCase
from scrapy.utils.test import docrawl
from scrapy.utils.test import get_crawler
from tests.spiders import FollowAllSpider, ItemSpider, ErrorSpider
from tests.mockserver import MockServer
@@ -16,45 +16,45 @@ class TestCloseSpider(TestCase):
@defer.inlineCallbacks
def test_closespider_itemcount(self):
spider = ItemSpider()
close_on = 5
yield docrawl(spider, {'CLOSESPIDER_ITEMCOUNT': close_on})
reason = spider.meta['close_reason']
crawler = get_crawler(ItemSpider, {'CLOSESPIDER_ITEMCOUNT': close_on})
yield crawler.crawl()
reason = crawler.spider.meta['close_reason']
self.assertEqual(reason, 'closespider_itemcount')
itemcount = spider.crawler.stats.get_value('item_scraped_count')
itemcount = crawler.stats.get_value('item_scraped_count')
self.assertTrue(itemcount >= close_on)
@defer.inlineCallbacks
def test_closespider_pagecount(self):
spider = FollowAllSpider()
close_on = 5
yield docrawl(spider, {'CLOSESPIDER_PAGECOUNT': close_on})
reason = spider.meta['close_reason']
crawler = get_crawler(FollowAllSpider, {'CLOSESPIDER_PAGECOUNT': close_on})
yield crawler.crawl()
reason = crawler.spider.meta['close_reason']
self.assertEqual(reason, 'closespider_pagecount')
pagecount = spider.crawler.stats.get_value('response_received_count')
pagecount = crawler.stats.get_value('response_received_count')
self.assertTrue(pagecount >= close_on)
@defer.inlineCallbacks
def test_closespider_errorcount(self):
spider = ErrorSpider(total=1000000)
close_on = 5
yield docrawl(spider, {'CLOSESPIDER_ERRORCOUNT': close_on})
self.flushLoggedErrors(spider.exception_cls)
reason = spider.meta['close_reason']
crawler = get_crawler(ErrorSpider, {'CLOSESPIDER_ERRORCOUNT': close_on})
yield crawler.crawl(total=1000000)
self.flushLoggedErrors(crawler.spider.exception_cls)
reason = crawler.spider.meta['close_reason']
self.assertEqual(reason, 'closespider_errorcount')
key = 'spider_exceptions/{name}'\
.format(name=spider.exception_cls.__name__)
errorcount = spider.crawler.stats.get_value(key)
.format(name=crawler.spider.exception_cls.__name__)
errorcount = crawler.stats.get_value(key)
self.assertTrue(errorcount >= close_on)
@defer.inlineCallbacks
def test_closespider_timeout(self):
spider = FollowAllSpider(total=1000000)
close_on = 0.1
yield docrawl(spider, {'CLOSESPIDER_TIMEOUT': close_on})
reason = spider.meta['close_reason']
crawler = get_crawler(FollowAllSpider, {'CLOSESPIDER_TIMEOUT': close_on})
yield crawler.crawl(total=1000000)
reason = crawler.spider.meta['close_reason']
self.assertEqual(reason, 'closespider_timeout')
stats = spider.crawler.stats
stats = crawler.stats
start = stats.get_value('start_time')
stop = stats.get_value('finish_time')
diff = stop - start
......
@@ -3,7 +3,7 @@ import socket
import mock
from twisted.internet import defer
from twisted.trial.unittest import TestCase
from scrapy.utils.test import docrawl, get_testlog
from scrapy.utils.test import get_crawler, get_testlog
from tests.spiders import FollowAllSpider, DelaySpider, SimpleSpider, \
BrokenStartRequestsSpider, SingleRequestSpider, DuplicateStartRequestsSpider
from tests.mockserver import MockServer
@@ -21,9 +21,9 @@ class CrawlTestCase(TestCase):
@defer.inlineCallbacks
def test_follow_all(self):
spider = FollowAllSpider()
yield docrawl(spider)
self.assertEqual(len(spider.urls_visited), 11) # 10 + start_url
crawler = get_crawler(FollowAllSpider)
yield crawler.crawl()
self.assertEqual(len(crawler.spider.urls_visited), 11) # 10 + start_url
@defer.inlineCallbacks
def test_delay(self):
@@ -37,9 +37,9 @@ class CrawlTestCase(TestCase):
@defer.inlineCallbacks
def _test_delay(self, delay, randomize):
settings = {"DOWNLOAD_DELAY": delay, 'RANDOMIZE_DOWNLOAD_DELAY': randomize}
spider = FollowAllSpider(maxlatency=delay * 2)
yield docrawl(spider, settings)
t = spider.times
crawler = get_crawler(FollowAllSpider, settings)
yield crawler.crawl(maxlatency=delay * 2)
t = crawler.spider.times
totaltime = t[-1] - t[0]
avgd = totaltime / (len(t) - 1)
tolerance = 0.6 if randomize else 0.2
@@ -48,85 +48,79 @@ class CrawlTestCase(TestCase):
@defer.inlineCallbacks
def test_timeout_success(self):
spider = DelaySpider(n=0.5)
yield docrawl(spider)
self.assertTrue(spider.t1 > 0)
self.assertTrue(spider.t2 > 0)
self.assertTrue(spider.t2 > spider.t1)
crawler = get_crawler(DelaySpider)
yield crawler.crawl(n=0.5)
self.assertTrue(crawler.spider.t1 > 0)
self.assertTrue(crawler.spider.t2 > 0)
self.assertTrue(crawler.spider.t2 > crawler.spider.t1)
@defer.inlineCallbacks
def test_timeout_failure(self):
spider = DelaySpider(n=0.5)
yield docrawl(spider, {"DOWNLOAD_TIMEOUT": 0.35})
self.assertTrue(spider.t1 > 0)
self.assertTrue(spider.t2 == 0)
self.assertTrue(spider.t2_err > 0)
self.assertTrue(spider.t2_err > spider.t1)
crawler = get_crawler(DelaySpider, {"DOWNLOAD_TIMEOUT": 0.35})
yield crawler.crawl(n=0.5)
self.assertTrue(crawler.spider.t1 > 0)
self.assertTrue(crawler.spider.t2 == 0)
self.assertTrue(crawler.spider.t2_err > 0)
self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)
# server hangs after receiving response headers
spider = DelaySpider(n=0.5, b=1)
yield docrawl(spider, {"DOWNLOAD_TIMEOUT": 0.35})
self.assertTrue(spider.t1 > 0)
self.assertTrue(spider.t2 == 0)
self.assertTrue(spider.t2_err > 0)
self.assertTrue(spider.t2_err > spider.t1)
yield crawler.crawl(n=0.5, b=1)
self.assertTrue(crawler.spider.t1 > 0)
self.assertTrue(crawler.spider.t2 == 0)
self.assertTrue(crawler.spider.t2_err > 0)
self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)
@defer.inlineCallbacks
def test_retry_503(self):
spider = SimpleSpider("http://localhost:8998/status?n=503")
yield docrawl(spider)
crawler = get_crawler(SimpleSpider)
yield crawler.crawl("http://localhost:8998/status?n=503")
self._assert_retried()
@defer.inlineCallbacks
def test_retry_conn_failed(self):
spider = SimpleSpider("http://localhost:65432/status?n=503")
yield docrawl(spider)
crawler = get_crawler(SimpleSpider)
yield crawler.crawl("http://localhost:65432/status?n=503")
self._assert_retried()
@defer.inlineCallbacks
def test_retry_dns_error(self):
with mock.patch('socket.gethostbyname',
side_effect=socket.gaierror(-5, 'No address associated with hostname')):
spider = SimpleSpider("http://example.com/")
yield docrawl(spider)
crawler = get_crawler(SimpleSpider)
yield crawler.crawl("http://example.com/")
self._assert_retried()
@defer.inlineCallbacks
def test_start_requests_bug_before_yield(self):
spider = BrokenStartRequestsSpider(fail_before_yield=1)
yield docrawl(spider)
crawler = get_crawler(BrokenStartRequestsSpider)
yield crawler.crawl(fail_before_yield=1)
errors = self.flushLoggedErrors(ZeroDivisionError)
self.assertEqual(len(errors), 1)
@defer.inlineCallbacks
def test_start_requests_bug_yielding(self):
spider = BrokenStartRequestsSpider(fail_yielding=1)
yield docrawl(spider)
crawler = get_crawler(BrokenStartRequestsSpider)
yield crawler.crawl(fail_yielding=1)
errors = self.flushLoggedErrors(ZeroDivisionError)
self.assertEqual(len(errors), 1)
@defer.inlineCallbacks
def test_start_requests_lazyness(self):
settings = {"CONCURRENT_REQUESTS": 1}
spider = BrokenStartRequestsSpider()
yield docrawl(spider, settings)
#self.assertTrue(False, spider.seedsseen)
#self.assertTrue(spider.seedsseen.index(None) < spider.seedsseen.index(99),
# spider.seedsseen)
crawler = get_crawler(BrokenStartRequestsSpider, settings)
yield crawler.crawl()
#self.assertTrue(False, crawler.spider.seedsseen)
#self.assertTrue(crawler.spider.seedsseen.index(None) < crawler.spider.seedsseen.index(99),
# crawler.spider.seedsseen)
@defer.inlineCallbacks
def test_start_requests_dupes(self):
settings = {"CONCURRENT_REQUESTS": 1}
spider = DuplicateStartRequestsSpider(dont_filter=True,
distinct_urls=2,
dupe_factor=3)
yield docrawl(spider, settings)
self.assertEqual(spider.visited, 6)
crawler = get_crawler(DuplicateStartRequestsSpider, settings)
yield crawler.crawl(dont_filter=True, distinct_urls=2, dupe_factor=3)
self.assertEqual(crawler.spider.visited, 6)
spider = DuplicateStartRequestsSpider(dont_filter=False,
distinct_urls=3,
dupe_factor=4)
yield docrawl(spider, settings)
self.assertEqual(spider.visited, 3)
yield crawler.crawl(dont_filter=False, distinct_urls=3, dupe_factor=4)
self.assertEqual(crawler.spider.visited, 3)
@defer.inlineCallbacks
def test_unbounded_response(self):
@@ -150,23 +144,23 @@ Connection: close
foo body
with multiples lines
'''})
spider = SimpleSpider("http://localhost:8998/raw?{0}".format(query))
yield docrawl(spider)
crawler = get_crawler(SimpleSpider)
yield crawler.crawl("http://localhost:8998/raw?{0}".format(query))
log = get_testlog()
self.assertEqual(log.count("Got response 200"), 1)
@defer.inlineCallbacks
def test_retry_conn_lost(self):
# connection lost after receiving data
spider = SimpleSpider("http://localhost:8998/drop?abort=0")
yield docrawl(spider)
crawler = get_crawler(SimpleSpider)
yield crawler.crawl("http://localhost:8998/drop?abort=0")
self._assert_retried()
@defer.inlineCallbacks
def test_retry_conn_aborted(self):
# connection lost before receiving data
spider = SimpleSpider("http://localhost:8998/drop?abort=1")
yield docrawl(spider)
crawler = get_crawler(SimpleSpider)
yield crawler.crawl("http://localhost:8998/drop?abort=1")
self._assert_retried()
def _assert_retried(self):
@@ -184,22 +178,22 @@ with multiples lines
req0.meta['next'] = req1
req1.meta['next'] = req2
req2.meta['next'] = req3
spider = SingleRequestSpider(seed=req0)
yield docrawl(spider)
crawler = get_crawler(SingleRequestSpider)
yield crawler.crawl(seed=req0)
# basic asserts in case of weird communication errors
self.assertIn('responses', spider.meta)
self.assertNotIn('failures', spider.meta)
self.assertIn('responses', crawler.spider.meta)
self.assertNotIn('failures', crawler.spider.meta)
# start requests doesn't set Referer header
echo0 = json.loads(spider.meta['responses'][2].body)
echo0 = json.loads(crawler.spider.meta['responses'][2].body)
self.assertNotIn('Referer', echo0['headers'])
# following request sets Referer to start request url
echo1 = json.loads(spider.meta['responses'][1].body)
echo1 = json.loads(crawler.spider.meta['responses'][1].body)
self.assertEqual(echo1['headers'].get('Referer'), [req0.url])
# next request avoids Referer header
echo2 = json.loads(spider.meta['responses'][2].body)
echo2 = json.loads(crawler.spider.meta['responses'][2].body)
self.assertNotIn('Referer', echo2['headers'])
# last request explicitly sets a Referer header
echo3 = json.loads(spider.meta['responses'][3].body)
echo3 = json.loads(crawler.spider.meta['responses'][3].body)
self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com'])
@defer.inlineCallbacks
@@ -208,11 +202,11 @@ with multiples lines
est = []
def cb(response):
est.append(get_engine_status(spider.crawler.engine))
est.append(get_engine_status(crawler.engine))
spider = SingleRequestSpider(seed='http://localhost:8998/', callback_func=cb)
yield docrawl(spider)
crawler = get_crawler(SingleRequestSpider)
yield crawler.crawl(seed='http://localhost:8998/', callback_func=cb)
self.assertEqual(len(est), 1, est)
s = dict(est[0])
self.assertEqual(s['engine.spider.name'], spider.name)
self.assertEqual(s['engine.spider.name'], crawler.spider.name)
self.assertEqual(s['len(engine.scraper.slot.active)'], 1)
@@ -47,19 +47,22 @@ class LoadTestCase(unittest.TestCase):
def test_enabled_handler(self):
handlers = {'scheme': 'tests.test_downloader_handlers.DummyDH'}
dh = DownloadHandlers(get_crawler({'DOWNLOAD_HANDLERS': handlers}))
crawler = get_crawler(settings_dict={'DOWNLOAD_HANDLERS': handlers})
dh = DownloadHandlers(crawler)
self.assertIn('scheme', dh._handlers)
self.assertNotIn('scheme', dh._notconfigured)
def test_not_configured_handler(self):
handlers = {'scheme': 'tests.test_downloader_handlers.OffDH'}
dh = DownloadHandlers(get_crawler({'DOWNLOAD_HANDLERS': handlers}))
crawler = get_crawler(settings_dict={'DOWNLOAD_HANDLERS': handlers})
dh = DownloadHandlers(crawler)
self.assertNotIn('scheme', dh._handlers)
self.assertIn('scheme', dh._notconfigured)
def test_disabled_handler(self):
handlers = {'scheme': None}
dh = DownloadHandlers(get_crawler({'DOWNLOAD_HANDLERS': handlers}))
crawler = get_crawler(settings_dict={'DOWNLOAD_HANDLERS': handlers})
dh = DownloadHandlers(crawler)
self.assertNotIn('scheme', dh._handlers)
self.assertNotIn('scheme', dh._notconfigured)
......
@@ -12,9 +12,8 @@ class ManagerTestCase(TestCase):
settings_dict = None
def setUp(self):
self.crawler = get_crawler(self.settings_dict)
self.spider = Spider('foo')
self.spider.set_crawler(self.crawler)
self.crawler = get_crawler(Spider, self.settings_dict)
self.spider = self.crawler._create_spider('foo')
self.mwman = DownloaderMiddlewareManager.from_crawler(self.crawler)
# some mw depends on stats collector
self.crawler.stats.open_spider(self.spider)
......
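The middleware and stats test setups that follow all apply the same substitution: instead of instantiating Spider('foo') and attaching a crawler via spider.set_crawler(crawler), the spider is created through the crawler, so it comes back already bound to the crawler and its settings. A minimal standalone sketch of that pattern, assuming only the updated get_crawler; the USER_AGENT value is illustrative:

```python
from scrapy.spider import Spider
from scrapy.utils.test import get_crawler

# Old pattern (removed across the test suite):
#   crawler = get_crawler(settings_dict)
#   spider = Spider('foo')
#   spider.set_crawler(crawler)
#
# New pattern: the crawler is built for a spider class and creates the
# spider instance itself, so the spider already sees the crawler's settings.
crawler = get_crawler(Spider, {'USER_AGENT': 'test-agent'})  # illustrative setting
spider = crawler._create_spider('foo')

assert spider.name == 'foo'
assert crawler.settings.get('USER_AGENT') == 'test-agent'
```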
@@ -9,8 +9,8 @@ __doctests__ = ['scrapy.contrib.downloadermiddleware.ajaxcrawl']
class AjaxCrawlMiddlewareTest(unittest.TestCase):
def setUp(self):
self.spider = Spider('foo')
crawler = get_crawler({'AJAXCRAWL_ENABLED': True})
crawler = get_crawler(Spider, {'AJAXCRAWL_ENABLED': True})
self.spider = crawler._create_spider('foo')
self.mw = AjaxCrawlMiddleware.from_crawler(crawler)
def _ajaxcrawlable_body(self):
......
@@ -10,9 +10,8 @@ from scrapy.utils.test import get_crawler
class TestDefaultHeadersMiddleware(TestCase):
def get_defaults_spider_mw(self):
crawler = get_crawler()
spider = Spider('foo')
spider.set_crawler(crawler)
crawler = get_crawler(Spider)
spider = crawler._create_spider('foo')
defaults = dict([(k, [v]) for k, v in \
six.iteritems(crawler.settings.get('DEFAULT_REQUEST_HEADERS'))])
return defaults, spider, DefaultHeadersMiddleware.from_crawler(crawler)
......
@@ -9,9 +9,8 @@ from scrapy.utils.test import get_crawler
class DownloadTimeoutMiddlewareTest(unittest.TestCase):
def get_request_spider_mw(self):
crawler = get_crawler()
spider = Spider('foo')
spider.set_crawler(crawler)
crawler = get_crawler(Spider)
spider = crawler._create_spider('foo')
request = Request('http://scrapytest.org/')
return request, spider, DownloadTimeoutMiddleware.from_crawler(crawler)
......
@@ -24,8 +24,8 @@ class _BaseTest(unittest.TestCase):
self.yesterday = email.utils.formatdate(time.time() - 86400)
self.today = email.utils.formatdate()
self.tomorrow = email.utils.formatdate(time.time() + 86400)
self.crawler = get_crawler()
self.spider = Spider('example.com')
self.crawler = get_crawler(Spider)
self.spider = self.crawler._create_spider('example.com')
self.tmpdir = tempfile.mkdtemp()
self.request = Request('http://www.example.com',
headers={'User-Agent': 'test'})
......
@@ -10,8 +10,8 @@ from scrapy.utils.test import get_crawler
class RedirectMiddlewareTest(unittest.TestCase):
def setUp(self):
crawler = get_crawler()
self.spider = Spider('foo')
crawler = get_crawler(Spider)
self.spider = crawler._create_spider('foo')
self.mw = RedirectMiddleware.from_crawler(crawler)
def test_priority_adjust(self):
@@ -123,8 +123,8 @@ class RedirectMiddlewareTest(unittest.TestCase):
class MetaRefreshMiddlewareTest(unittest.TestCase):
def setUp(self):
crawler = get_crawler()
self.spider = Spider('foo')
crawler = get_crawler(Spider)
self.spider = crawler._create_spider('foo')
self.mw = MetaRefreshMiddleware.from_crawler(crawler)
def _body(self, interval=5, url='http://example.org/newpage'):
......
@@ -14,8 +14,8 @@ from scrapy.utils.test import get_crawler
class RetryTest(unittest.TestCase):
def setUp(self):
crawler = get_crawler()
self.spider = Spider('foo')
crawler = get_crawler(Spider)
self.spider = crawler._create_spider('foo')
self.mw = RetryMiddleware.from_crawler(crawler)
self.mw.max_retry_times = 2
......
@@ -9,8 +9,8 @@ from scrapy.utils.test import get_crawler
class TestDownloaderStats(TestCase):
def setUp(self):
self.crawler = get_crawler()
self.spider = Spider('scrapytest.org')
self.crawler = get_crawler(Spider)
self.spider = self.crawler._create_spider('scrapytest.org')
self.mw = DownloaderStats(self.crawler.stats)
self.crawler.stats.open_spider(self.spider)
......
@@ -9,9 +9,8 @@ from scrapy.utils.test import get_crawler
class UserAgentMiddlewareTest(TestCase):
def get_spider_and_mw(self, default_useragent):
crawler = get_crawler({'USER_AGENT': default_useragent})
spider = Spider('foo')
spider.set_crawler(crawler)
crawler = get_crawler(Spider, {'USER_AGENT': default_useragent})
spider = crawler._create_spider('foo')
return spider, UserAgentMiddleware.from_crawler(crawler)
def test_default_agent(self):
......
@@ -87,20 +87,18 @@ class CrawlerRun(object):
self.portno = self.port.getHost().port
start_urls = [self.geturl("/"), self.geturl("/redirect")]
self.spider = TestSpider(start_urls=start_urls)
for name, signal in vars(signals).items():
if not name.startswith('_'):
dispatcher.connect(self.record_signal, signal)
self.crawler = get_crawler()
self.crawler = get_crawler(TestSpider)
self.crawler.install()
self.crawler.configure()
self.crawler.signals.connect(self.item_scraped, signals.item_scraped)
self.crawler.signals.connect(self.request_scheduled, signals.request_scheduled)
self.crawler.signals.connect(self.response_downloaded, signals.response_downloaded)
self.crawler.crawl(self.spider)
self.crawler.start()
self.crawler.crawl(start_urls=start_urls)
self.spider = self.crawler.spider
self.deferred = defer.Deferred()
dispatcher.connect(self.stop, signals.engine_stopped)
......
@@ -8,7 +8,7 @@ from netlib import http_auth
from twisted.internet import defer
from twisted.trial.unittest import TestCase
from scrapy.utils.test import get_testlog, docrawl
from scrapy.utils.test import get_testlog, get_crawler
from scrapy.http import Request
from tests.spiders import SimpleSpider, SingleRequestSpider
from tests.mockserver import MockServer
@@ -49,29 +49,29 @@ class ProxyConnectTestCase(TestCase):
@defer.inlineCallbacks
def test_https_connect_tunnel(self):
spider = SimpleSpider("https://localhost:8999/status?n=200")
yield docrawl(spider)
crawler = get_crawler(SimpleSpider)
yield crawler.crawl("https://localhost:8999/status?n=200")
self._assert_got_response_code(200)
@defer.inlineCallbacks
def test_https_noconnect(self):
os.environ['https_proxy'] = 'http://scrapy:scrapy@localhost:8888?noconnect'
spider = SimpleSpider("https://localhost:8999/status?n=200")
yield docrawl(spider)
crawler = get_crawler(SimpleSpider)
yield crawler.crawl("https://localhost:8999/status?n=200")
self._assert_got_response_code(200)
os.environ['https_proxy'] = 'http://scrapy:scrapy@localhost:8888'
@defer.inlineCallbacks
def test_https_connect_tunnel_error(self):
spider = SimpleSpider("https://localhost:99999/status?n=200")
yield docrawl(spider)
crawler = get_crawler(SimpleSpider)
yield crawler.crawl("https://localhost:99999/status?n=200")
self._assert_got_tunnel_error()
@defer.inlineCallbacks
def test_https_tunnel_auth_error(self):
os.environ['https_proxy'] = 'http://wrong:wronger@localhost:8888'
spider = SimpleSpider("https://localhost:8999/status?n=200")
yield docrawl(spider)
crawler = get_crawler(SimpleSpider)
yield crawler.crawl("https://localhost:8999/status?n=200")
# The proxy returns a 407 error code but it does not reach the client;
# he just sees a TunnelError.
self._assert_got_tunnel_error()
@@ -80,17 +80,17 @@ class ProxyConnectTestCase(TestCase):
@defer.inlineCallbacks
def test_https_tunnel_without_leak_proxy_authorization_header(self):
request = Request("https://localhost:8999/echo")
spider = SingleRequestSpider(seed=request)
yield docrawl(spider)
crawler = get_crawler(SingleRequestSpider)
yield crawler.crawl(seed=request)
self._assert_got_response_code(200)
echo = json.loads(spider.meta['responses'][0].body)
echo = json.loads(crawler.spider.meta['responses'][0].body)
self.assertTrue('Proxy-Authorization' not in echo['headers'])
@defer.inlineCallbacks
def test_https_noconnect_auth_error(self):
os.environ['https_proxy'] = 'http://wrong:wronger@localhost:8888?noconnect'
spider = SimpleSpider("https://localhost:8999/status?n=200")
yield docrawl(spider)
crawler = get_crawler(SimpleSpider)
yield crawler.crawl("https://localhost:8999/status?n=200")
self._assert_got_response_code(407)
def _assert_got_response_code(self, code):
......
@@ -242,7 +242,8 @@ class CrawlSpiderTest(SpiderTest):
self.assertTrue(hasattr(spider, '_follow_links'))
self.assertTrue(spider._follow_links)
crawler.settings.set('CRAWLSPIDER_FOLLOW_LINKS', False)
settings_dict = {'CRAWLSPIDER_FOLLOW_LINKS': False}
crawler = get_crawler(settings_dict=settings_dict)
spider = self.spider_class.from_crawler(crawler, 'example.com')
self.assertTrue(hasattr(spider, '_follow_links'))
self.assertFalse(spider._follow_links)
@@ -256,7 +257,8 @@ class CrawlSpiderTest(SpiderTest):
self.assertTrue(spider._follow_links)
spider = self.spider_class('example.com')
spider.set_crawler(get_crawler({'CRAWLSPIDER_FOLLOW_LINKS': False}))
settings_dict = {'CRAWLSPIDER_FOLLOW_LINKS': False}
spider.set_crawler(get_crawler(settings_dict=settings_dict))
self.assertTrue(hasattr(spider, '_follow_links'))
self.assertFalse(spider._follow_links)
......
@@ -10,9 +10,10 @@ from scrapy.utils.test import get_crawler
class TestDepthMiddleware(TestCase):
def setUp(self):
self.spider = Spider('scrapytest.org')
crawler = get_crawler(Spider)
self.spider = crawler._create_spider('scrapytest.org')
self.stats = StatsCollector(get_crawler())
self.stats = StatsCollector(crawler)
self.stats.open_spider(self.spider)
self.mw = DepthMiddleware(1, self.stats, True)
......
@@ -3,7 +3,7 @@ from unittest import TestCase
from twisted.trial.unittest import TestCase as TrialTestCase
from twisted.internet import defer
from scrapy.utils.test import docrawl, get_testlog
from scrapy.utils.test import get_crawler, get_testlog
from tests.mockserver import MockServer
from scrapy.http import Response, Request
from scrapy.spider import Spider
@@ -165,20 +165,20 @@ class TestHttpErrorMiddlewareIntegrational(TrialTestCase):
@defer.inlineCallbacks
def test_middleware_works(self):
spider = _HttpErrorSpider()
yield docrawl(spider)
assert not spider.skipped, spider.skipped
self.assertEqual(spider.parsed, {'200'})
self.assertEqual(spider.failed, {'404', '402', '500'})
crawler = get_crawler(_HttpErrorSpider)
yield crawler.crawl()
assert not crawler.spider.skipped, crawler.spider.skipped
self.assertEqual(crawler.spider.parsed, {'200'})
self.assertEqual(crawler.spider.failed, {'404', '402', '500'})
@defer.inlineCallbacks
def test_logging(self):
spider = _HttpErrorSpider(bypass_status_codes={402})
yield docrawl(spider)
crawler = get_crawler(_HttpErrorSpider)
yield crawler.crawl(bypass_status_codes={402})
# print(get_testlog())
self.assertEqual(spider.parsed, {'200', '402'})
self.assertEqual(spider.skipped, {'402'})
self.assertEqual(spider.failed, {'404', '500'})
self.assertEqual(crawler.spider.parsed, {'200', '402'})
self.assertEqual(crawler.spider.skipped, {'402'})
self.assertEqual(crawler.spider.failed, {'404', '500'})
log = get_testlog()
self.assertIn('Ignoring response <404', log)
......
@@ -10,13 +10,13 @@ class TestOffsiteMiddleware(TestCase):
class TestOffsiteMiddleware(TestCase):
def setUp(self):
self.spider = self._get_spider()
crawler = get_crawler()
crawler = get_crawler(Spider)
self.spider = crawler._create_spider(**self._get_spiderargs())
self.mw = OffsiteMiddleware.from_crawler(crawler)
self.mw.spider_opened(self.spider)
def _get_spider(self):
return Spider('foo', allowed_domains=['scrapytest.org', 'scrapy.org'])
def _get_spiderargs(self):
return dict(name='foo', allowed_domains=['scrapytest.org', 'scrapy.org'])
def test_process_spider_output(self):
res = Response('http://scrapytest.org')
@@ -39,8 +39,8 @@ class TestOffsiteMiddleware(TestCase):
class TestOffsiteMiddleware2(TestOffsiteMiddleware):
def _get_spider(self):
return Spider('foo', allowed_domains=None)
def _get_spiderargs(self):
return dict(name='foo', allowed_domains=None)
def test_process_spider_output(self):
res = Response('http://scrapytest.org')
@@ -58,7 +58,7 @@ class TestOffsiteMiddleware4(TestOffsiteMiddleware3):
def _get_spider(self):
bad_hostname = urlparse('http:////scrapytest.org').hostname
return Spider('foo', allowed_domains=['scrapytest.org', None, bad_hostname])
return dict(name='foo', allowed_domains=['scrapytest.org', None, bad_hostname])
def test_process_spider_output(self):
res = Response('http://scrapytest.org')
......
@@ -7,8 +7,8 @@ class StatsCollectorTest(unittest.TestCase):
class StatsCollectorTest(unittest.TestCase):
def setUp(self):
self.crawler = get_crawler()
self.spider = Spider('foo')
self.crawler = get_crawler(Spider)
self.spider = self.crawler._create_spider('foo')
def test_collector(self):
stats = StatsCollector(self.crawler)
......