Commit 5835224e

Merge pull request #896 from scrapy/robotstxt-once

[MRG] process robots.txt once

Authored on Oct 02, 2014 by Pablo Hoffman
Parents: 9e2c6043, 36eec8f4

Showing 4 changed files with 62 additions and 22 deletions:

docs/topics/downloader-middleware.rst             (+9, -1)
docs/topics/request-response.rst                  (+1, -0)
scrapy/contrib/downloadermiddleware/robotstxt.py  (+8, -5)
tests/test_downloadermiddleware_robotstxt.py      (+44, -16)
docs/topics/downloader-middleware.rst

```diff
@@ -785,11 +785,19 @@ RobotsTxtMiddleware
 and the :setting:`ROBOTSTXT_OBEY` setting is enabled.
 
 .. warning:: Keep in mind that, if you crawl using multiple concurrent
-   requests per domain, Scrapy could still
-   download some forbidden pages
+   requests per domain, Scrapy could still download some forbidden pages
    if they were requested before the robots.txt file was downloaded. This
    is a known limitation of the current robots.txt middleware and will
    be fixed in the future.
 
+.. reqmeta:: dont_obey_robotstxt
+
+If :attr:`Request.meta <scrapy.http.Request.meta>` has
+``dont_obey_robotstxt`` key set to True
+the request will be ignored by this middleware even if
+:setting:`ROBOTSTXT_OBEY` is enabled.
+
 DownloaderStats
 ---------------
```
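For illustration, here is a request that opts out under the documentation added above; the snippet is not part of the commit and the URL is a placeholder:

```python
from scrapy.http import Request

# With ROBOTSTXT_OBEY enabled, this request is still passed through by
# RobotsTxtMiddleware because of the meta flag documented above.
req = Request('http://site.local/admin/main',
              meta={'dont_obey_robotstxt': True})
```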
docs/topics/request-response.rst

```diff
@@ -228,6 +228,7 @@ Those are:
 * :reqmeta:`cookiejar`
 * :reqmeta:`redirect_urls`
 * :reqmeta:`bindaddress`
+* :reqmeta:`dont_obey_robotstxt`
 
 .. reqmeta:: bindaddress
```
scrapy/contrib/downloadermiddleware/robotstxt.py

```diff
@@ -22,16 +22,16 @@ class RobotsTxtMiddleware(object):
         self.crawler = crawler
         self._useragent = crawler.settings.get('USER_AGENT')
         self._parsers = {}
         self._spider_netlocs = set()
 
     @classmethod
     def from_crawler(cls, crawler):
         return cls(crawler)
 
     def process_request(self, request, spider):
-        useragent = self._useragent
+        if request.meta.get('dont_obey_robotstxt'):
+            return
         rp = self.robot_parser(request, spider)
-        if rp and not rp.can_fetch(useragent, request.url):
+        if rp and not rp.can_fetch(self._useragent, request.url):
             log.msg(format="Forbidden by robots.txt: %(request)s",
                     level=log.DEBUG, request=request)
             raise IgnoreRequest
@@ -42,10 +42,13 @@ class RobotsTxtMiddleware(object):
         if netloc not in self._parsers:
             self._parsers[netloc] = None
             robotsurl = "%s://%s/robots.txt" % (url.scheme, url.netloc)
-            robotsreq = Request(robotsurl, priority=self.DOWNLOAD_PRIORITY)
+            robotsreq = Request(
+                robotsurl,
+                priority=self.DOWNLOAD_PRIORITY,
+                meta={'dont_obey_robotstxt': True}
+            )
             dfd = self.crawler.engine.download(robotsreq, spider)
             dfd.addCallback(self._parse_robots)
             self._spider_netlocs.add(netloc)
         return self._parsers[netloc]
 
     def _parse_robots(self, response):
```
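The meta flag does double duty in this change: besides the user-facing opt-out, the middleware now tags its own robots.txt request with it, so that request short-circuits in process_request instead of being checked against a parser for the very file it is fetching. The "once" behavior rests on the None placeholder stored in _parsers[netloc] before the download completes, so later requests for the same netloc reuse the single in-flight fetch. A toy sketch of that placeholder-then-fill pattern, with hypothetical names (not code from this commit):

```python
from twisted.internet.defer import Deferred


class OncePerKey(object):
    """Hypothetical illustration of robot_parser()'s caching strategy:
    the first caller for a key starts the work and stores a None
    placeholder; later callers see the key and do not re-trigger it."""

    def __init__(self):
        self._results = {}

    def get(self, key, start_work):
        if key not in self._results:
            self._results[key] = None  # placeholder: fetch is in flight
            d = start_work(key)        # returns a Deferred, like engine.download
            d.addCallback(self._store, key)
        return self._results[key]      # None until the Deferred fires

    def _store(self, result, key):
        self._results[key] = result
        return result


# Hypothetical usage: the first get() starts the work, the second does not.
cache = OncePerKey()
pending = []

def fake_download(key):
    d = Deferred()
    pending.append(d)  # would normally fire when the network responds
    return d

assert cache.get('site.local', fake_download) is None   # fetch started
assert cache.get('site.local', fake_download) is None   # no second fetch
assert len(pending) == 1
pending[0].callback('parsed robots.txt')                 # response arrives
assert cache.get('site.local', fake_download) == 'parsed robots.txt'
```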
tests/test_downloadermiddleware_robotstxt.py

```diff
+from __future__ import absolute_import
 import re
 import mock
 from twisted.internet import reactor
@@ -11,7 +12,44 @@ from scrapy.settings import Settings
 class RobotsTxtMiddlewareTest(unittest.TestCase):
 
-    def test(self):
+    def test_robotstxt(self):
+        middleware = self._get_middleware()
+        # There is a bit of neglect in robotstxt.py: robots.txt is fetched asynchronously,
+        # and it is actually fetched only *after* first process_request completes.
+        # So, first process_request will always succeed.
+        # We defer test() because otherwise robots.txt download mock will be called after assertRaises failure.
+        self.assertNotIgnored(Request('http://site.local'), middleware)
+        def test(r):
+            self.assertNotIgnored(Request('http://site.local/allowed'), middleware)
+            self.assertIgnored(Request('http://site.local/admin/main'), middleware)
+            self.assertIgnored(Request('http://site.local/static/'), middleware)
+        deferred = Deferred()
+        deferred.addCallback(test)
+        reactor.callFromThread(deferred.callback, None)
+        return deferred
+
+    def test_robotstxt_meta(self):
+        meta = {'dont_obey_robotstxt': True}
+        middleware = self._get_middleware()
+        self.assertNotIgnored(Request('http://site.local', meta=meta), middleware)
+        def test(r):
+            self.assertNotIgnored(Request('http://site.local/allowed', meta=meta), middleware)
+            self.assertNotIgnored(Request('http://site.local/admin/main', meta=meta), middleware)
+            self.assertNotIgnored(Request('http://site.local/static/', meta=meta), middleware)
+        deferred = Deferred()
+        deferred.addCallback(test)
+        reactor.callFromThread(deferred.callback, None)
+        return deferred
+
+    def assertNotIgnored(self, request, middleware):
+        spider = None  # not actually used
+        self.assertIsNone(middleware.process_request(request, spider))
+
+    def assertIgnored(self, request, middleware):
+        spider = None  # not actually used
+        self.assertRaises(IgnoreRequest,
+                          middleware.process_request, request, spider)
+
+    def _get_crawler(self):
         crawler = mock.MagicMock()
         crawler.settings = Settings()
         crawler.settings.set('USER_AGENT', 'CustomAgent')
@@ -29,18 +67,8 @@ class RobotsTxtMiddlewareTest(unittest.TestCase):
             reactor.callFromThread(deferred.callback, response)
             return deferred
         crawler.engine.download.side_effect = return_response
-        middleware = RobotsTxtMiddleware(crawler)
-        spider = None  # not actually used
-        # There is a bit of neglect in robotstxt.py: robots.txt is fetched asynchronously,
-        # and it is actually fetched only *after* first process_request completes.
-        # So, first process_request will always succeed.
-        # We defer test() because otherwise robots.txt download mock will be called after assertRaises failure.
-        self.assertIsNone(middleware.process_request(Request('http://site.local'), spider))  # not affected by robots.txt
-        def test(r):
-            self.assertIsNone(middleware.process_request(Request('http://site.local/allowed'), spider))
-            self.assertRaises(IgnoreRequest, middleware.process_request, Request('http://site.local/admin/main'), spider)
-            self.assertRaises(IgnoreRequest, middleware.process_request, Request('http://site.local/static/'), spider)
-        deferred = Deferred()
-        deferred.addCallback(test)
-        reactor.callFromThread(deferred.callback, None)
-        return deferred
+        return crawler
+
+    def _get_middleware(self):
+        crawler = self._get_crawler()
+        return RobotsTxtMiddleware(crawler)
```
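The return_response helper referenced in the last hunk is mostly collapsed in this view; what it does is stub crawler.engine.download so the middleware receives a canned robots.txt asynchronously. A sketch of that fixture under that assumption — the robots.txt rules below are invented to match the paths the tests probe, not taken from the commit:

```python
import mock
from twisted.internet import reactor
from twisted.internet.defer import Deferred
from scrapy.http import Response
from scrapy.settings import Settings


def _get_crawler():
    # Sketch only: a MagicMock crawler whose engine.download returns a
    # Deferred that fires with a fake robots.txt response.
    crawler = mock.MagicMock()
    crawler.settings = Settings()
    crawler.settings.set('USER_AGENT', 'CustomAgent')

    def return_response(request, spider):
        body = ("User-agent: *\n"      # assumed rules, for illustration
                "Disallow: /admin/\n"
                "Disallow: /static/\n")
        deferred = Deferred()
        # Fire from the reactor thread, mirroring the hunk above.
        reactor.callFromThread(deferred.callback,
                               Response(request.url, body=body))
        return deferred

    crawler.engine.download.side_effect = return_response
    return crawler
```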