...
 
Commits (4)
    https://gitcode.net/OpenDocCN/epub-crawler/-/commit/0f000b5df40172a71c713ac01bf88f01ed940c86 2022-05-30 16:44:19 2022-05-30T16:44:19+08:00 wizardforcel 562826179@qq.com
    https://gitcode.net/OpenDocCN/epub-crawler/-/commit/44a8fe6e2c5087fc33ec9b7e9b8a095f3489df0d 2022-05-30 16:45:11 2022-05-30T16:45:11+08:00 wizardforcel 562826179@qq.com
    https://gitcode.net/OpenDocCN/epub-crawler/-/commit/7c92745047c2b3f3148744a9c1eeb6b5d16faa2b 2022-05-30 16:45:40 2022-05-30T16:45:40+08:00 wizardforcel 562826179@qq.com
    https://gitcode.net/OpenDocCN/epub-crawler/-/commit/4b1fffa0aa6f3bcefe3772983e210488969cf66f 2022-05-30 16:45:57 2022-05-30T16:45:57+08:00 wizardforcel 562826179@qq.com
# 历史记录
v2022.5.30.0
+ 修复带有 XML 标签的文档爬取失败的问题
v2022.3.25.0
+ 新增缓存功能
......
......@@ -10,4 +10,4 @@ from . import util
__author__ = "ApacheCN"
__email__ = "apachecn@163.com"
__license__ = "SATA"
__version__ = "2022.3.25.0"
__version__ = "2022.5.30.0"
......@@ -65,6 +65,9 @@ def get_toc(html, base):
return res
def get_article(html, url):
# 预处理掉 XML 声明和命名空间
html = re.sub(r'<\?xml[^>]*\?>', '', html)
html = re.sub(r'xmlns=".+?"', '', html)
root = pq(html)
if config['remove']:
......