Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
looyolo
scrapy
提交
0e7ee125
S
scrapy
项目概览
looyolo
/
scrapy
与 Fork 源项目一致
从无法访问的项目Fork
通知
2
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
S
scrapy
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
未验证
提交
0e7ee125
编写于
3月 20, 2021
作者:
M
Mikhail Korobov
提交者:
GitHub
3月 20, 2021
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #5036 from dmiwell/urllength-dont-skip-silently
UrlLengthMiddleware: don't skip silently
上级
308a58aa
9cc4513b
变更
2
隐藏空白更改
内联
并排
Showing
2 changed files
with
36 additions
and
12 deletions
+36
-12
scrapy/spidermiddlewares/urllength.py
scrapy/spidermiddlewares/urllength.py
+6
-3
tests/test_spidermiddleware_urllength.py
tests/test_spidermiddleware_urllength.py
+30
-9
未找到文件。
scrapy/spidermiddlewares/urllength.py
浏览文件 @
0e7ee125
...
...
def process_spider_output(self, response, result, spider):
    """Filter requests whose URL exceeds ``self.maxlength``.

    Any ``Request`` with ``len(request.url) > self.maxlength`` is dropped;
    the skip is logged at INFO level and counted in the
    ``urllength/request_ignored_count`` stat so filtering is no longer
    silent. Every other value yielded by the spider passes through
    unchanged.
    """
    def _filter(request):
        # Only Request objects are length-checked; items and any other
        # yielded values are always kept.
        if isinstance(request, Request) and len(request.url) > self.maxlength:
            # The diff replaced the old debug-level message with INFO and
            # added the stats counter (PR #5036) — keep only the new form.
            logger.info(
                "Ignoring link (url length > %(maxlength)d): %(url)s ",
                {'maxlength': self.maxlength, 'url': request.url},
                extra={'spider': spider},
            )
            spider.crawler.stats.inc_value(
                'urllength/request_ignored_count', spider=spider)
            return False
        else:
            return True

    # NOTE(review): this trailing return sits outside the visible diff hunk
    # ("..." context markers); reconstructed from the enclosing function's
    # contract — confirm against the full file.
    return (r for r in result or () if _filter(r))
...
...
tests/test_spidermiddleware_urllength.py
浏览文件 @
0e7ee125
from unittest import TestCase

from testfixtures import LogCapture

from scrapy.http import Response, Request
from scrapy.settings import Settings
from scrapy.spidermiddlewares.urllength import UrlLengthMiddleware
from scrapy.spiders import Spider
from scrapy.utils.test import get_crawler


class TestUrlLengthMiddleware(TestCase):
    """Tests for UrlLengthMiddleware: overlong URLs must be dropped,
    logged at INFO, and counted in crawler stats (not skipped silently).

    This is the post-merge state of the diff: the old standalone
    ``test_process_spider_output`` (local ``res``/``reqs``/``mw``) was
    replaced by a shared ``setUp`` fixture plus focused tests.
    """

    def setUp(self):
        self.maxlength = 25
        settings = Settings({'URLLENGTH_LIMIT': self.maxlength})

        # A real crawler is needed so the middleware can reach
        # spider.crawler.stats when it counts ignored requests.
        crawler = get_crawler(Spider)
        self.spider = crawler._create_spider('foo')
        self.stats = crawler.stats

        self.mw = UrlLengthMiddleware.from_settings(settings)

        self.response = Response('http://scrapytest.org')
        self.short_url_req = Request('http://scrapytest.org/')
        self.long_url_req = Request('http://scrapytest.org/this_is_a_long_url')
        self.reqs = [self.short_url_req, self.long_url_req]

    def process_spider_output(self):
        # Helper: run the middleware over the fixture requests and
        # materialize the filtered output.
        return list(self.mw.process_spider_output(
            self.response, self.reqs, self.spider))

    def test_middleware_works(self):
        # Only the short URL survives filtering.
        self.assertEqual(self.process_spider_output(), [self.short_url_req])

    def test_logging(self):
        with LogCapture() as log:
            self.process_spider_output()

        # The dropped request must be counted in crawler stats...
        ric = self.stats.get_value(
            'urllength/request_ignored_count', spider=self.spider)
        self.assertEqual(ric, 1)

        # ...and reported in the log output.
        self.assertIn(
            f'Ignoring link (url length > {self.maxlength})', str(log))
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录